From b332828c39326b1dca617f387dd15d12e81cd5f0 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:43:10 +0530 Subject: hw-breakpoints: prepare the code for Hardware Breakpoint interfaces The generic hardware breakpoint interface provides an abstraction of hardware breakpoints in front of specific arch implementations for both kernel and user side breakpoints. This includes execution breakpoints and read/write breakpoints, also known as "watchpoints". This patch introduces header files containing constants, structure definitions and declaration of functions used by the hardware breakpoint core and x86 specific code. It also introduces an array based storage for the debug-register values in 'struct thread_struct', while modifying all users of debugreg member in the structure. [ Impact: add headers for new hardware breakpoint interface ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/a.out-core.h | 8 +++--- arch/x86/include/asm/debugreg.h | 29 +++++++++++++++++++ arch/x86/include/asm/hw_breakpoint.h | 55 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 8 +++--- arch/x86/kernel/process.c | 16 +++++------ arch/x86/kernel/ptrace.c | 16 +++++------ arch/x86/power/cpu_32.c | 8 +++--- arch/x86/power/cpu_64.c | 8 +++--- 8 files changed, 116 insertions(+), 32 deletions(-) create mode 100644 arch/x86/include/asm/hw_breakpoint.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index bb70e397aa84..fc4685dd6e4d 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -32,10 +32,10 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg0; - dump->u_debugreg[1] = current->thread.debugreg1; - dump->u_debugreg[2] = current->thread.debugreg2; - dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[0] = current->thread.debugreg[0]; + dump->u_debugreg[1] = current->thread.debugreg[1]; + dump->u_debugreg[2] = current->thread.debugreg[2]; + dump->u_debugreg[3] = current->thread.debugreg[3]; dump->u_debugreg[4] = 0; dump->u_debugreg[5] = 0; dump->u_debugreg[6] = current->thread.debugreg6; diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 3ea6f37be9e2..23439fbb1d0e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -18,6 +18,7 @@ #define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP2 (0x4) /* db2 */ #define DR_TRAP3 (0x8) /* db3 */ +#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ @@ -49,6 +50,8 @@ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ +#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */ +#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */ #define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ #define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ @@ -67,4 +70,30 @@ #define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ #define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ +/* + * HW breakpoint additions + */ +#ifdef __KERNEL__ + +/* For process management */ +extern void flush_thread_hw_breakpoint(struct task_struct *tsk); 
+extern int copy_thread_hw_breakpoint(struct task_struct *tsk, + struct task_struct *child, unsigned long clone_flags); + +/* For CPU management */ +extern void load_debug_registers(void); +static inline void hw_breakpoint_disable(void) +{ + /* Zero the control register for HW Breakpoint */ + set_debugreg(0UL, 7); + + /* Zero-out the individual HW breakpoint address registers */ + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); +} + +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h new file mode 100644 index 000000000000..1acb4d45de70 --- /dev/null +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -0,0 +1,55 @@ +#ifndef _I386_HW_BREAKPOINT_H +#define _I386_HW_BREAKPOINT_H + +#ifdef __KERNEL__ +#define __ARCH_HW_BREAKPOINT_H + +struct arch_hw_breakpoint { + char *name; /* Contains name of the symbol to set bkpt */ + unsigned long address; + u8 len; + u8 type; +}; + +#include +#include + +/* Available HW breakpoint length encodings */ +#define HW_BREAKPOINT_LEN_1 0x40 +#define HW_BREAKPOINT_LEN_2 0x44 +#define HW_BREAKPOINT_LEN_4 0x4c +#define HW_BREAKPOINT_LEN_EXECUTE 0x40 + +#ifdef CONFIG_X86_64 +#define HW_BREAKPOINT_LEN_8 0x48 +#endif + +/* Available HW breakpoint type encodings */ + +/* trigger on instruction execute */ +#define HW_BREAKPOINT_EXECUTE 0x80 +/* trigger on memory write */ +#define HW_BREAKPOINT_WRITE 0x81 +/* trigger on memory read or write */ +#define HW_BREAKPOINT_RW 0x83 + +/* Total number of available HW breakpoint registers */ +#define HBP_NUM 4 + +extern struct hw_breakpoint *hbp_kernel[HBP_NUM]; +DECLARE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]); +extern unsigned int hbp_user_refcount[HBP_NUM]; + +extern void arch_install_thread_hw_breakpoint(struct task_struct *tsk); +extern void arch_uninstall_thread_hw_breakpoint(void); +extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); +extern int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, + struct task_struct *tsk); +extern void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk); +extern void arch_flush_thread_hw_breakpoint(struct task_struct *tsk); +extern void arch_update_kernel_hw_breakpoint(void *); +extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, + unsigned long val, void *data); +#endif /* __KERNEL__ */ +#endif /* _I386_HW_BREAKPOINT_H */ + diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0b2fab0051e0..448b34a8e393 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -29,6 +29,7 @@ struct mm_struct; #include #include +#define HBP_NUM 4 /* * Default implementation of macro that returns current * instruction pointer ("program counter"). 
@@ -431,12 +432,11 @@ struct thread_struct { unsigned long fs; unsigned long gs; /* Hardware debugging registers: */ - unsigned long debugreg0; - unsigned long debugreg1; - unsigned long debugreg2; - unsigned long debugreg3; + unsigned long debugreg[HBP_NUM]; unsigned long debugreg6; unsigned long debugreg7; + /* Hardware breakpoint info */ + struct hw_breakpoint *hbp[HBP_NUM]; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fb5dfb891f0f..291527cb438a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -106,10 +106,10 @@ void flush_thread(void) clear_tsk_thread_flag(tsk, TIF_DEBUG); - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; + tsk->thread.debugreg[0] = 0; + tsk->thread.debugreg[1] = 0; + tsk->thread.debugreg[2] = 0; + tsk->thread.debugreg[3] = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); @@ -194,10 +194,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); + set_debugreg(next->debugreg[0], 0); + set_debugreg(next->debugreg[1], 1); + set_debugreg(next->debugreg[2], 2); + set_debugreg(next->debugreg[3], 3); /* no 4 and 5 */ set_debugreg(next->debugreg6, 6); set_debugreg(next->debugreg7, 7); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c13..313be40be55a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -471,10 +471,10 @@ static int genregs_set(struct task_struct *target, static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) { switch (n) { - case 0: return child->thread.debugreg0; - case 1: return child->thread.debugreg1; - case 2: return child->thread.debugreg2; - case 3: return child->thread.debugreg3; + case 0: return child->thread.debugreg[0]; + case 1: return child->thread.debugreg[1]; + case 2: return child->thread.debugreg[2]; + case 3: return child->thread.debugreg[3]; case 6: return child->thread.debugreg6; case 7: return child->thread.debugreg7; } @@ -493,10 +493,10 @@ static int ptrace_set_debugreg(struct task_struct *child, return -EIO; switch (n) { - case 0: child->thread.debugreg0 = data; break; - case 1: child->thread.debugreg1 = data; break; - case 2: child->thread.debugreg2 = data; break; - case 3: child->thread.debugreg3 = data; break; + case 0: child->thread.debugreg[0] = data; break; + case 1: child->thread.debugreg[1] = data; break; + case 2: child->thread.debugreg[2] = data; break; + case 3: child->thread.debugreg[3] = data; break; case 6: if ((data & ~0xffffffffUL) != 0) diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index ce702c5b3a2c..519913948003 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -84,10 +84,10 @@ static void fix_processor_context(void) * Now maybe reload the debug registers */ if (current->thread.debugreg7) { - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); + set_debugreg(current->thread.debugreg[0], 0); + set_debugreg(current->thread.debugreg[1], 1); + set_debugreg(current->thread.debugreg[2], 2); + 
set_debugreg(current->thread.debugreg[3], 3); /* no 4 and 5 */ set_debugreg(current->thread.debugreg6, 6); set_debugreg(current->thread.debugreg7, 7); diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 5343540f2607..1e3bdcc959ff 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -163,10 +163,10 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ if (current->thread.debugreg7){ - loaddebug(&current->thread, 0); - loaddebug(&current->thread, 1); - loaddebug(&current->thread, 2); - loaddebug(&current->thread, 3); + set_debugreg(current->thread.debugreg[0], 0); + set_debugreg(current->thread.debugreg[1], 1); + set_debugreg(current->thread.debugreg[2], 2); + set_debugreg(current->thread.debugreg[3], 3); /* no 4 and 5 */ loaddebug(&current->thread, 6); loaddebug(&current->thread, 7); -- cgit v1.2.2 From 0067f1297241ea567f2b22a455519752d70fcca9 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:43:57 +0530 Subject: hw-breakpoints: x86 architecture implementation of Hardware Breakpoint interfaces This patch introduces the arch-specific implementation of the generic hardware breakpoints in kernel/hw_breakpoint.c inside x86 specific directories. It contains functions which help to validate and serve requests using Hardware Breakpoint registers on x86 processors. [ fweisbec@gmail.com: fix conflict against kmemcheck ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/Kconfig | 1 + arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/hw_breakpoint.c | 382 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 384 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/hw_breakpoint.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index df9e885eee14..3033375ed6bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -46,6 +46,7 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_HW_BREAKPOINT config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 77df4d654ff9..cbc781829173 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o +obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..4867c9f3b5fb --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,382 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Unmasked kernel DR7 value */ +static unsigned long kdr7; + +/* + * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register. + * Used to clear and verify the status of bits corresponding to DR0 - DR3 + */ +static const unsigned long dr7_masks[HBP_NUM] = { + 0x000f0003, /* LEN0, R/W0, G0, L0 */ + 0x00f0000c, /* LEN1, R/W1, G1, L1 */ + 0x0f000030, /* LEN2, R/W2, G2, L2 */ + 0xf00000c0 /* LEN3, R/W3, G3, L3 */ +}; + + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) | + DR_GLOBAL_SLOWDOWN; + return bp_info; +} + +void arch_update_kernel_hw_breakpoint(void *unused) +{ + struct hw_breakpoint *bp; + int i, cpu = get_cpu(); + unsigned long temp_kdr7 = 0; + + /* Don't allow debug exceptions while we update the registers */ + set_debugreg(0UL, 7); + + for (i = hbp_kernel_pos; i < HBP_NUM; i++) { + per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i]; + if (bp) { + temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type); + set_debugreg(bp->info.address, i); + } + } + + /* No need to set DR6. Update the debug registers with kernel-space + * breakpoint values from kdr7 and user-space requests from the + * current process + */ + kdr7 = temp_kdr7; + set_debugreg(kdr7 | current->thread.debugreg7, 7); + put_cpu_no_resched(); +} + +/* + * Install the thread breakpoints in their debug registers. + */ +void arch_install_thread_hw_breakpoint(struct task_struct *tsk) +{ + struct thread_struct *thread = &(tsk->thread); + + switch (hbp_kernel_pos) { + case 4: + set_debugreg(thread->debugreg[3], 3); + case 3: + set_debugreg(thread->debugreg[2], 2); + case 2: + set_debugreg(thread->debugreg[1], 1); + case 1: + set_debugreg(thread->debugreg[0], 0); + default: + break; + } + + /* No need to set DR6 */ + set_debugreg((kdr7 | thread->debugreg7), 7); +} + +/* + * Install the debug register values for just the kernel, no thread. + */ +void arch_uninstall_thread_hw_breakpoint() +{ + /* Clear the user-space portion of debugreg7 by setting only kdr7 */ + set_debugreg(kdr7, 7); + +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case HW_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case HW_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case HW_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in user space. + */ +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va <= TASK_SIZE - len); +} + +/* + * Check for virtual address in kernel space. 
+ */ +int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Store a breakpoint's encoded address, length, and type. + */ +static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk) +{ + /* + * User-space requests will always have the address field populated + * Symbol names from user-space are rejected + */ + if (tsk && bp->info.name) + return -EINVAL; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (bp->info.name) + bp->info.address = (unsigned long) + kallsyms_lookup_name(bp->info.name); + if (bp->info.address) + return 0; + return -EINVAL; +} + +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, + struct task_struct *tsk) +{ + unsigned int align; + int ret = -EINVAL; + + switch (bp->info.type) { + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + case HW_BREAKPOINT_EXECUTE: + if ((!arch_check_va_in_userspace(bp->info.address, + bp->info.len)) && + bp->info.len != HW_BREAKPOINT_LEN_EXECUTE) + return ret; + break; + case HW_BREAKPOINT_WRITE: + break; + case HW_BREAKPOINT_RW: + break; + default: + return ret; + } + + switch (bp->info.len) { + case HW_BREAKPOINT_LEN_1: + align = 0; + break; + case HW_BREAKPOINT_LEN_2: + align = 1; + break; + case HW_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + if (bp->triggered) + ret = arch_store_info(bp, tsk); + + if (ret < 0) + return ret; + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (bp->info.address & align) + return -EINVAL; + + /* Check that the virtual address is in the proper range */ + if (tsk) { + if (!arch_check_va_in_userspace(bp->info.address, bp->info.len)) + return -EFAULT; + } else { + if (!arch_check_va_in_kernelspace(bp->info.address, + bp->info.len)) + return -EFAULT; + } + return 0; +} + +void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk) +{ + struct thread_struct *thread = &(tsk->thread); + struct hw_breakpoint *bp = thread->hbp[pos]; + + thread->debugreg7 &= ~dr7_masks[pos]; + if (bp) { + thread->debugreg[pos] = bp->info.address; + thread->debugreg7 |= encode_dr7(pos, bp->info.len, + bp->info.type); + } else + thread->debugreg[pos] = 0; +} + +void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *thread = &(tsk->thread); + + thread->debugreg7 = 0; + for (i = 0; i < HBP_NUM; i++) + thread->debugreg[i] = 0; +} + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). 
+ * + * NOTIFY_STOP returned for all other cases + * + */ +int __kprobes hw_breakpoint_handler(struct die_args *args) +{ + int i, cpu, rc = NOTIFY_STOP; + struct hw_breakpoint *bp; + /* The DR6 value is stored in args->err */ + unsigned long dr7, dr6 = args->err; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + /* Lazy debug register switching */ + if (!test_tsk_thread_flag(current, TIF_DEBUG)) + arch_uninstall_thread_hw_breakpoint(); + + get_debugreg(dr7, 7); + /* Disable breakpoints during exception handling */ + set_debugreg(0UL, 7); + /* + * Assert that local interrupts are disabled + * Reset the DRn bits in the virtualized register value. + * The ptrace trigger routine will add in whatever is needed. + */ + current->thread.debugreg6 &= ~DR_TRAP_BITS; + cpu = get_cpu(); + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + /* + * Find the corresponding hw_breakpoint structure and + * invoke its triggered callback. + */ + if (i >= hbp_kernel_pos) + bp = per_cpu(this_hbp_kernel[i], cpu); + else { + bp = current->thread.hbp[i]; + if (bp) + rc = NOTIFY_DONE; + } + /* + * bp can be NULL due to lazy debug register switching + * or due to the delay between updates of hbp_kernel_pos + * and this_hbp_kernel. + */ + if (!bp) + continue; + + (bp->triggered)(bp, args->regs); + } + if (dr6 & (~DR_TRAP_BITS)) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu_no_resched(); + return rc; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} -- cgit v1.2.2 From 08d68323d1f0c34452e614263b212ca556dae47f Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:08 +0530 Subject: hw-breakpoints: modifying generic debug exception to use thread-specific debug registers This patch modifies the breakpoint exception handler code to use the new abstract debug register names. [ fweisbec@gmail.com: fix conflict against kmemcheck ] [ Impact: refactor and cleanup x86 debug exception handler ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/traps.c | 69 +++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..de9913247dd0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -529,73 +529,52 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. 
*/ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code, SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } -- cgit v1.2.2 From 1e3500666f7c5daaadadb8431a2927cdbbdb7dd4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:26 +0530 Subject: hw-breakpoints: use wrapper routines around debug registers in processor related functions This patch enables the use of wrapper routines to access the debug/breakpoint registers on cpu management. The hardcoded debug registers save and restore operations for threads breakpoints are replaced by wrappers. And now that we handle the kernel breakpoints too, we also need to handle them on cpu hotplug operations. 
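Only the declaration of load_debug_registers() is visible in this arch/x86 log; its definition belongs to the generic kernel/hw_breakpoint.c side of the series. As a rough illustration, the load side has to undo hw_breakpoint_disable() on a CPU coming online or resuming. A minimal sketch, assuming the arch hooks introduced in the earlier patches, not the series' verbatim code:

	/* Sketch only: rewrite DR0-DR3 and DR7 from the virtualized state. */
	void load_debug_registers(void)
	{
		/* Re-broadcast the system-wide kernel breakpoints to this CPU */
		arch_update_kernel_hw_breakpoint(NULL);

		/* Reinstall the current thread's breakpoints, if it owns any */
		if (test_tsk_thread_flag(current, TIF_DEBUG))
			arch_install_thread_hw_breakpoint(current);
	}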
[ Impact: adapt new hardware breakpoint api to cpu hotplug ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/smpboot.c | 3 +++ arch/x86/power/cpu_32.c | 13 +++---------- arch/x86/power/cpu_64.c | 12 +++--------- 3 files changed, 9 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d8..2b2652d205c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -326,6 +327,7 @@ notrace static void __cpuinit start_secondary(void *unused) setup_secondary_clock(); wmb(); + load_debug_registers(); cpu_idle(); } @@ -1250,6 +1252,7 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); + hw_breakpoint_disable(); } int native_cpu_disable(void) diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 519913948003..2bc3b016de90 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -13,6 +13,7 @@ #include #include #include +#include static struct saved_context saved_context; @@ -48,6 +49,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); ctxt->cr4 = read_cr4_safe(); + hw_breakpoint_disable(); } /* Needed by apm.c */ @@ -83,16 +85,7 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ - if (current->thread.debugreg7) { - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); - } - + load_debug_registers(); } static void __restore_processor_state(struct saved_context *ctxt) diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 1e3bdcc959ff..46866a13a93a 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -16,6 +16,7 @@ #include #include #include +#include static void fix_processor_context(void); @@ -71,6 +72,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr3 = read_cr3(); ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); + hw_breakpoint_disable(); } void save_processor_state(void) @@ -162,13 +164,5 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ - if (current->thread.debugreg7){ - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); - /* no 4 and 5 */ - loaddebug(&current->thread, 6); - loaddebug(&current->thread, 7); - } + load_debug_registers(); } -- cgit v1.2.2 From 66cb5917295958652ff6ba36d83f98f2379c46b4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:55 +0530 Subject: hw-breakpoints: use the new wrapper routines to access debug registers in process/thread code This patch enables the use of abstract debug registers in process-handling routines, according to the new hardware breakpoint Api.
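The copy_thread() hunks below call copy_thread_hw_breakpoint(), which again is only declared in this arch/x86 log. A sketch of the fork-time duplication it has to perform; the kmemdup()-based copy here is an assumption for illustration, not the series' verbatim code:

	/* Sketch only: give the child private copies of the parent's
	 * breakpoint structures and rebuild its virtualized DR7. */
	int copy_thread_hw_breakpoint(struct task_struct *tsk,
				      struct task_struct *child,
				      unsigned long clone_flags)
	{
		int i;

		for (i = 0; i < HBP_NUM; i++) {
			struct hw_breakpoint *bp = tsk->thread.hbp[i];

			if (!bp)
				continue;
			child->thread.hbp[i] = kmemdup(bp, sizeof(*bp),
						       GFP_KERNEL);
			if (!child->thread.hbp[i])
				return -ENOMEM;
			/* Re-encode len/type/enable bits into the child's DR7 */
			arch_update_user_hw_breakpoint(i, child);
		}
		set_tsk_thread_flag(child, TIF_DEBUG);
		return 0;
	}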
[ Impact: adapt thread breakpoints handling code to the new breakpoint Api ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 22 ++++++---------------- arch/x86/kernel/process_32.c | 28 ++++++++++++++++++++++++++++ arch/x86/kernel/process_64.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 291527cb438a..19a686c401b5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -46,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + flush_thread_hw_breakpoint(tsk); WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } @@ -106,12 +110,8 @@ void flush_thread(void) clear_tsk_thread_flag(tsk, TIF_DEBUG); - tsk->thread.debugreg[0] = 0; - tsk->thread.debugreg[1] = 0; - tsk->thread.debugreg[2] = 0; - tsk->thread.debugreg[3] = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + flush_thread_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -193,16 +193,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b5e4bfef4472..297ffff2ffc2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,6 +61,8 @@ #include #include #include +#include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -265,7 +267,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + if (copy_thread_hw_breakpoint(tsk, p, clone_flags)) + goto out; + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); @@ -285,10 +293,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); +out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + if (err) + flush_thread_hw_breakpoint(p); clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); p->thread.ds_ctx = NULL; @@ -427,6 +438,23 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) lazy_load_gs(next->gs); percpu_write(current_task, next_p); + /* + * There's a problem with moving the arch_install_thread_hw_breakpoint() + * call before current is updated. 
Suppose a kernel breakpoint is + * triggered in between the two, the hw-breakpoint handler will see that + * the 'current' task does not have TIF_DEBUG flag set and will think it + * is leftover from an old task (lazy switching) and will erase it. Then + * until the next context switch, no user-breakpoints will be installed. + * + * The real problem is that it's impossible to update both current and + * physical debug registers at the same instant, so there will always be + * a window in which they disagree and a breakpoint might get triggered. + * Since we use lazy switching, we are forced to assume that a + * disagreement means that current is correct and the exception is due + * to lazy debug register switching. + */ + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) + arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a1a1de292ec..f7b276d4b3fb 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,6 +55,8 @@ #include #include #include +#include +#include asmlinkage extern void ret_from_fork(void); @@ -248,6 +250,8 @@ void release_thread(struct task_struct *dead_task) BUG(); } } + if (unlikely(dead_task->thread.debugreg7)) + flush_thread_hw_breakpoint(dead_task); } static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) @@ -303,12 +307,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG))) + if (copy_thread_hw_breakpoint(me, p, clone_flags)) + goto out; + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -347,6 +357,9 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + if (err) + flush_thread_hw_breakpoint(p); + return err; } @@ -492,6 +505,24 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); + /* + * There's a problem with moving the arch_install_thread_hw_breakpoint() + * call before current is updated. Suppose a kernel breakpoint is + * triggered in between the two, the hw-breakpoint handler will see that + * the 'current' task does not have TIF_DEBUG flag set and will think it + * is leftover from an old task (lazy switching) and will erase it. Then + * until the next context switch, no user-breakpoints will be installed. + * + * The real problem is that it's impossible to update both current and + * physical debug registers at the same instant, so there will always be + * a window in which they disagree and a breakpoint might get triggered. + * Since we use lazy switching, we are forced to assume that a + * disagreement means that current is correct and the exception is due + * to lazy debug register switching. 
+ */ + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) + arch_install_thread_hw_breakpoint(next_p); + return prev_p; } -- cgit v1.2.2 From da0cdc14f5f7e0faee6b2393fefed056cdb17146 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:45:03 +0530 Subject: hw-breakpoints: modify signal handling code to refrain from re-enabling HW Breakpoints This patch disables re-enabling of Hardware Breakpoint registers through the signal handling code. This is now done during from hw_breakpoint_handler(). Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/signal.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e3..f33d2e0ef095 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -800,15 +800,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* -- cgit v1.2.2 From 72f674d203cd230426437cdcf7dd6f681dad8b0d Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:45:48 +0530 Subject: hw-breakpoints: modify Ptrace routines to access breakpoint registers This patch modifies the ptrace code to use the new wrapper routines around the debug/breakpoint registers. [ Impact: adapt x86 ptrace to the new breakpoint Api ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Signed-off-by: Maneesh Soni Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 231 +++++++++++++++++++++++++++++------------------ 1 file changed, 141 insertions(+), 90 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 313be40be55a..b457f78b7dbf 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -136,11 +137,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ - return TASK_SIZE - 3; -} - #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) @@ -265,15 +261,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - return IA32_PAGE_OFFSET - 3; -#endif - return TASK_SIZE_MAX - 7; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -464,95 +451,159 @@ static int genregs_set(struct task_struct *target, } /* - * This function is trivial and will be inlined by the compiler. - * Having it separates the implementation details of debug - * registers from the interface details of ptrace. + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. 
*/ -static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, + unsigned *type) { - switch (n) { - case 0: return child->thread.debugreg[0]; - case 1: return child->thread.debugreg[1]; - case 2: return child->thread.debugreg[2]; - case 3: return child->thread.debugreg[3]; - case 6: return child->thread.debugreg6; - case 7: return child->thread.debugreg7; - } - return 0; + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } -static int ptrace_set_debugreg(struct task_struct *child, - int n, unsigned long data) +static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) { + struct thread_struct *thread = &(current->thread); int i; - if (unlikely(n == 4 || n == 5)) - return -EIO; + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < hbp_kernel_pos; i++) + /* + * We will check bp->info.address against the address stored in + * thread's hbp structure and not debugreg[i]. This is to ensure + * that the corresponding bit for 'i' in DR7 register is enabled + */ + if (bp->info.address == thread->hbp[i]->info.address) + break; - if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) - return -EIO; + thread->debugreg6 |= (DR_TRAP0 << i); +} - switch (n) { - case 0: child->thread.debugreg[0] = data; break; - case 1: child->thread.debugreg[1] = data; break; - case 2: child->thread.debugreg[2] = data; break; - case 3: child->thread.debugreg[3] = data; break; +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long old_dr7 = thread->debugreg7; + int i, orig_ret = 0, rc = 0; + int enabled, second_pass = 0; + unsigned len, type; + struct hw_breakpoint *bp; + + data &= ~DR_CONTROL_RESERVED; +restore: + /* + * Loop through all the hardware breakpoints, making the + * appropriate changes to each. + */ + for (i = 0; i < HBP_NUM; i++) { + enabled = decode_dr7(data, i, &len, &type); + bp = thread->hbp[i]; + + if (!enabled) { + if (bp) { + /* Don't unregister the breakpoints right-away, + * unless all register_user_hw_breakpoint() + * requests have succeeded. This prevents + * any window of opportunity for debug + * register grabbing by other users. + */ + if (!second_pass) + continue; + unregister_user_hw_breakpoint(tsk, bp); + kfree(bp); + } + continue; + } + if (!bp) { + rc = -ENOMEM; + bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); + if (bp) { + bp->info.address = thread->debugreg[i]; + bp->triggered = ptrace_triggered; + bp->info.len = len; + bp->info.type = type; + rc = register_user_hw_breakpoint(tsk, bp); + if (rc) + kfree(bp); + } + } else + rc = modify_user_hw_breakpoint(tsk, bp); + if (rc) + break; + } + /* + * Make a second pass to free the remaining unused breakpoints + * or to restore the original breakpoints if an error occurred. + */ + if (!second_pass) { + second_pass = 1; + if (rc < 0) { + orig_ret = rc; + data = old_dr7; + } + goto restore; + } + return ((orig_ret < 0) ? orig_ret : rc); +} - case 6: - if ((data & ~0xffffffffUL) != 0) - return -EIO; - child->thread.debugreg6 = data; - break; +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. 
+ */ +unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long val = 0; + + if (n < HBP_NUM) + val = thread->debugreg[n]; + else if (n == 6) + val = thread->debugreg6; + else if (n == 7) + val = thread->debugreg7; + return val; +} - case 7: - /* - * Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System Programming) - */ -#ifdef CONFIG_X86_32 -#define DR7_MASK 0x5f54 -#else -#define DR7_MASK 0x5554 -#endif - data &= ~DR_CONTROL_RESERVED; - for (i = 0; i < 4; i++) - if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; +/* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +{ + struct thread_struct *thread = &(tsk->thread); + int rc = 0; + + /* There are no DR4 or DR5 registers */ + if (n == 4 || n == 5) + return -EIO; + + if (n == 6) { + tsk->thread.debugreg6 = val; + goto ret_path; } + if (n < HBP_NUM) { + if (thread->hbp[n]) { + if (arch_check_va_in_userspace(val, + thread->hbp[n]->info.len) == 0) { + rc = -EIO; + goto ret_path; + } + thread->hbp[n]->info.address = val; + } + thread->debugreg[n] = val; + } + /* All that's left is DR7 */ + if (n == 7) + rc = ptrace_write_dr7(tsk, val); - return 0; +ret_path: + return rc; } /* -- cgit v1.2.2 From 17f557e5b5d43a2af66c969f6560ac7105020672 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:46:03 +0530 Subject: hw-breakpoints: cleanup HW Breakpoint registers before kexec This patch disables Hardware breakpoints before doing a 'kexec' on the machine so that the cpu doesn't keep debug registers values which would be out of sync for the new image. 
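To make the stale state concrete, here is the DR7 value that encode_dr7() from the x86 implementation patch earlier in this log computes for a 4-byte read/write watchpoint in slot 0; the arithmetic is worked out for illustration only, from the constants already shown above.

	/*
	 * encode_dr7(0, HW_BREAKPOINT_LEN_4, HW_BREAKPOINT_RW):
	 *
	 *   (0x4c | 0x83) & 0xf           = 0xf        LEN0 = 0b11 (4 bytes), R/W0 = 0b11
	 *   0xf << (DR_CONTROL_SHIFT + 0) = 0x000f0000 bits 16-19
	 *   | DR_GLOBAL_ENABLE << 0       = 0x2        G0
	 *   | DR_GLOBAL_SLOWDOWN          = 0x200      GE
	 *
	 * DR7 = 0x000f0202, i.e. exactly the kind of leftover enable bits
	 * that hw_breakpoint_disable() must clear before jumping into the
	 * new kernel image.
	 */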
Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/machine_kexec_32.c | 2 ++ arch/x86/kernel/machine_kexec_64.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d00130..c843f8406da2 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -25,6 +25,7 @@ #include #include #include +#include static void set_idt(void *newidt, __u16 limit) { @@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 84c3bf209e98..4a8bb82248ae 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include #include #include +#include static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) @@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC -- cgit v1.2.2 From 62edab9056a6cf0c9207339c8892c923a5217e45 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:47:06 +0530 Subject: hw-breakpoints: reset bits in dr6 after the corresponding exception is handled This patch resets the bit in dr6 after the corresponding exception is handled in code, so that we keep a clean track of the current virtual debug status register. [ Impact: keep track of breakpoints triggering completion ] Signed-off-by: K.Prasad Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/hw_breakpoint.c | 13 +++++++++++-- arch/x86/kernel/kgdb.c | 6 ++++++ arch/x86/kernel/kprobes.c | 9 ++++++++- arch/x86/kernel/traps.c | 4 ++-- arch/x86/mm/kmmio.c | 8 +++++++- 5 files changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4867c9f3b5fb..69451473dbd2 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -314,8 +314,12 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct hw_breakpoint *bp; - /* The DR6 value is stored in args->err */ - unsigned long dr7, dr6 = args->err; + unsigned long dr7, dr6; + unsigned long *dr6_p; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) @@ -351,6 +355,11 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) if (bp) rc = NOTIFY_DONE; } + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); /* * bp can be NULL due to lazy debug register switching * or due to the delay between updates of hbp_kernel_pos diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b1f4dffb919e..f820b73c7f28 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) "resuming...\n"); kgdb_arch_handle_exception(args->trapnr, args->signr, args->err, "c", "", regs); + /* + * 
Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..b5b1848c5336 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -54,6 +54,7 @@ #include #include #include +#include void jprobe_return_end(void); @@ -967,8 +968,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index de9913247dd0..124a4d5a95b2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -545,8 +545,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; - if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code, - SIGTRAP) == NOTIFY_STOP) + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 16ccbd77917f..11a4ad4d6253 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) struct die_args *arg = args; if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) + if (post_kmmio_handler(arg->err, arg->regs) == 1) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; return NOTIFY_STOP; + } return NOTIFY_DONE; } -- cgit v1.2.2 From 4555835b707d5c778ee1c9076670bc99b1eeaf61 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 17 Jun 2009 14:44:19 +0530 Subject: x86: hw_breakpoint.c arch_check_va_in_kernelspace and hw_breakpoint_handler should be static arch_check_va_in_kernelspace() and hw_breakpoint_handler() is used only by same file so it should be static. Also fixed non-ANSI function declaration of function 'arch_uninstall_thread_hw_breakpoint' Fixed following sparse warnings : arch/x86/kernel/hw_breakpoint.c:124:42: warning: non-ANSI function declaration of function 'arch_uninstall_thread_hw_breakpoint' arch/x86/kernel/hw_breakpoint.c:169:5: warning: symbol 'arch_check_va_in_kernelspace' was not declared. Should it be static? arch/x86/kernel/hw_breakpoint.c:313:15: warning: symbol 'hw_breakpoint_handler' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Cc: Alan Stern Cc: "K.Prasad" Cc: Frederic Weisbecker LKML-Reference: <1245230059.2662.4.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 51d959528b1d..9316a9de4de3 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -121,7 +121,7 @@ void arch_install_thread_hw_breakpoint(struct task_struct *tsk) /* * Install the debug register values for just the kernel, no thread. 
*/ -void arch_uninstall_thread_hw_breakpoint() +void arch_uninstall_thread_hw_breakpoint(void) { /* Clear the user-space portion of debugreg7 by setting only kdr7 */ set_debugreg(kdr7, 7); @@ -166,7 +166,7 @@ int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) /* * Check for virtual address in kernel space. */ -int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) { unsigned int len; @@ -310,7 +310,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) * NOTIFY_STOP returned for all other cases * */ -int __kprobes hw_breakpoint_handler(struct die_args *args) +static int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct hw_breakpoint *bp; -- cgit v1.2.2 From 9d22b536609abf0d64648f99518676ea58245e3b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 19:52:30 +0530 Subject: x86: Mark ptrace_get_debugreg() as static This sparse warning: arch/x86/kernel/ptrace.c:560:15: warning: symbol 'ptrace_get_debugreg' was not declared. Should it be static? triggers because ptrace_get_debugreg() is global but is only used in a single .c file. Change ptrace_get_debugreg() to static to fix that - this also addresses the sparse warning. Signed-off-by: Jaswinder Singh Rajput Cc: Steven Rostedt LKML-Reference: <1246458150.6940.19.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b457f78b7dbf..cabdabce3cb2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -557,7 +557,7 @@ restore: /* * Handle PTRACE_PEEKUSR calls for the debug register area. */ -unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { struct thread_struct *thread = &(tsk->thread); unsigned long val = 0; -- cgit v1.2.2 From 39fe05e58c5e448601ce46e6b03900d5bf31c4b0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Aug 2009 11:16:12 +0800 Subject: x86, hpet: Disable per-cpu hpet timer if ARAT is supported If the CPU supports an always-running local APIC timer, the per-cpu hpet timers can be disabled, as they are useless and wasteful in that case. Let's leave the timers to others. The effect is that we reserve fewer timers.
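For reference, the feature the kernel tests as X86_FEATURE_ARAT is reported in CPUID leaf 06H, EAX bit 2. A small user-space probe, shown only as an illustration and not part of the patch:

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid(6, &eax, &ebx, &ecx, &edx))
			return 1;
		printf("ARAT (always-running APIC timer): %s\n",
		       (eax & (1u << 2)) ? "yes" : "no");
		return 0;
	}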
Signed-off-by: Shaohua Li Cc: venkatesh.pallipadi@intel.com LKML-Reference: <20090812031612.GA10062@sli10-desk.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dedc2bddf7a5..5969e1078fc2 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -584,6 +584,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (boot_cpu_has(X86_FEATURE_ARAT)) + return; id = hpet_readl(HPET_ID); num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); @@ -872,10 +874,8 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); - hpet_msi_capability_lookup(2); return 1; } - hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -908,9 +908,17 @@ static __init int hpet_late_init(void) if (!hpet_virt_address) return -ENODEV; + if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) + hpet_msi_capability_lookup(2); + else + hpet_msi_capability_lookup(0); + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (boot_cpu_has(X86_FEATURE_ARAT)) + return 0; + for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } -- cgit v1.2.2 From 5946fa3d5cdeb846a647a1900026af9f8b08c8b5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 19 Aug 2009 08:44:24 +0100 Subject: x86, hpet: Simplify the HPET code On 64-bits, using unsigned long when unsigned int suffices needlessly creates larger code (due to the need for REX prefixes), and most of the logic in hpet.c really doesn't need 64-bit operations. At once this avoids the need for a couple of type casts. Signed-off-by: Jan Beulich Cc: Shaohua Li Cc: Venkatesh Pallipadi LKML-Reference: <4A8BC9780200007800010832@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hpet.h | 2 +- arch/x86/kernel/hpet.c | 45 +++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 1c22cb05ad6a..65847c578b70 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -69,7 +69,7 @@ extern int hpet_force_user; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); -extern unsigned long hpet_readl(unsigned long a); +extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); extern void hpet_msi_unmask(unsigned int irq); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 5969e1078fc2..ba575f0f2e34 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -47,12 +47,12 @@ struct hpet_dev { char name[10]; }; -unsigned long hpet_readl(unsigned long a) +inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); } -static inline void hpet_writel(unsigned long d, unsigned long a) +static inline void hpet_writel(unsigned int d, unsigned int a) { writel(d, hpet_virt_address + a); } @@ -167,7 +167,7 @@ do { \ static void hpet_reserve_msi_timers(struct hpet_data *hd); -static void hpet_reserve_platform_timers(unsigned long id) +static void hpet_reserve_platform_timers(unsigned int id) { struct hpet __iomem *hpet = hpet_virt_address; struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; @@ -205,7 +205,7 @@ static void hpet_reserve_platform_timers(unsigned long id) } #else -static void 
hpet_reserve_platform_timers(unsigned long id) { } +static void hpet_reserve_platform_timers(unsigned int id) { } #endif /* @@ -246,7 +246,7 @@ static void hpet_reset_counter(void) static void hpet_start_counter(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -271,7 +271,7 @@ static void hpet_resume_counter(void) static void hpet_enable_legacy_int(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); @@ -314,7 +314,7 @@ static int hpet_setup_msi_irq(unsigned int irq); static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { - unsigned long cfg, cmp, now; + unsigned int cfg, cmp, now; uint64_t delta; switch (mode) { @@ -323,7 +323,7 @@ static void hpet_set_mode(enum clock_event_mode mode, delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned long) delta; + cmp = now + (unsigned int) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); /* Make sure we use edge triggered interrupts */ cfg &= ~HPET_TN_LEVEL; @@ -339,7 +339,7 @@ static void hpet_set_mode(enum clock_event_mode mode, * (See AMD-8111 HyperTransport I/O Hub Data Sheet, * Publication # 24674) */ - hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); + hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); hpet_start_counter(); hpet_print_config(); break; @@ -387,9 +387,9 @@ static int hpet_next_event(unsigned long delta, * what we wrote hit the chip before we compare it to the * counter. */ - WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); + WARN_ON_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt); - return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? 
-ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -415,7 +415,7 @@ static struct hpet_dev *hpet_devs; void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); - unsigned long cfg; + unsigned int cfg; /* unmask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); @@ -425,7 +425,7 @@ void hpet_msi_unmask(unsigned int irq) void hpet_msi_mask(unsigned int irq) { - unsigned long cfg; + unsigned int cfg; struct hpet_dev *hdev = get_irq_data(irq); /* mask it */ @@ -600,7 +600,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { struct hpet_dev *hdev = &hpet_devs[num_timers_used]; - unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + unsigned int cfg = hpet_readl(HPET_Tn_CFG(i)); /* Only consider HPET timer with MSI support */ if (!(cfg & HPET_TN_FSB_CAP)) @@ -815,7 +815,7 @@ static int hpet_clocksource_register(void) */ int __init hpet_enable(void) { - unsigned long id; + unsigned int id; int i; if (!is_hpet_capable()) @@ -933,7 +933,7 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { if (is_hpet_capable()) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; @@ -973,8 +973,8 @@ static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static u32 hpet_t1_cmp; -static unsigned long hpet_default_delta; -static unsigned long hpet_pie_delta; +static u32 hpet_default_delta; +static u32 hpet_pie_delta; static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; @@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); */ int hpet_rtc_timer_init(void) { - unsigned long cfg, cnt, delta, flags; + unsigned int cfg, cnt, delta; + unsigned long flags; if (!is_hpet_enabled()) return 0; @@ -1035,7 +1036,7 @@ int hpet_rtc_timer_init(void) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; - hpet_default_delta = (unsigned long) clc; + hpet_default_delta = clc; } if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) @@ -1121,7 +1122,7 @@ int hpet_set_periodic_freq(unsigned long freq) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; do_div(clc, freq); clc >>= hpet_clockevent.shift; - hpet_pie_delta = (unsigned long) clc; + hpet_pie_delta = clc; } return 1; } @@ -1135,7 +1136,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { - unsigned long cfg, delta; + unsigned int cfg, delta; int lost_ints = -1; if (unlikely(!hpet_rtc_flags)) { -- cgit v1.2.2 From eb13296cfaf6c699566473669a96a38a90562384 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:13 -0400 Subject: x86: Instruction decoder API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add x86 instruction decoder to arch-specific libraries. This decoder can decode x86 instructions used in kernel into prefix, opcode, modrm, sib, displacement and immediates. This can also show the length of instructions. This version introduces instruction attributes for decoding instructions. The instruction attribute tables are generated from the opcode map file (x86-opcode-map.txt) by the generator script(gen-insn-attr-x86.awk). Currently, the opcode maps are based on opcode maps in Intel(R) 64 and IA-32 Architectures Software Developers Manual Vol.2: Appendix.A, and consist of below two types of opcode tables. 
1-byte/2-byte/3-byte opcodes, which have 256 elements, are written as below: Table: table-name Referrer: escaped-name opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] (or) opcode: escape # escaped-name EndTable Group opcodes, which have 8 elements, are written as below: GrpTable: GrpXXX reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] EndTable These opcode maps include a few SSE and FP opcodes (for setup), because those opcodes are used in the kernel. Signed-off-by: Masami Hiramatsu Signed-off-by: Jim Keniston Acked-by: H. Peter Anvin Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203413.31965.49709.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/inat.h | 188 +++++++++ arch/x86/include/asm/inat_types.h | 29 ++ arch/x86/include/asm/insn.h | 143 +++++++ arch/x86/lib/Makefile | 13 + arch/x86/lib/inat.c | 78 ++++ arch/x86/lib/insn.c | 464 ++++++++++++++++++++++ arch/x86/lib/x86-opcode-map.txt | 719 +++++++++++++++++++++++++++++++++++ arch/x86/tools/gen-insn-attr-x86.awk | 314 +++++++++++++++ 8 files changed, 1948 insertions(+) create mode 100644 arch/x86/include/asm/inat.h create mode 100644 arch/x86/include/asm/inat_types.h create mode 100644 arch/x86/include/asm/insn.h create mode 100644 arch/x86/lib/inat.c create mode 100644 arch/x86/lib/insn.c create mode 100644 arch/x86/lib/x86-opcode-map.txt create mode 100644 arch/x86/tools/gen-insn-attr-x86.awk (limited to 'arch/x86') diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h new file mode 100644 index 000000000000..2866fddd1848 --- /dev/null +++ b/arch/x86/include/asm/inat.h @@ -0,0 +1,188 @@ +#ifndef _ASM_X86_INAT_H +#define _ASM_X86_INAT_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include <asm/inat_types.h> + +/* + * Internal bits. Don't use bitmasks directly, because these bits are + * unstable. You should use checking functions.
+ */ + +#define INAT_OPCODE_TABLE_SIZE 256 +#define INAT_GROUP_TABLE_SIZE 8 + +/* Legacy instruction prefixes */ +#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ +#define INAT_PFX_REPNE 2 /* 0xF2 */ /* LPFX2 */ +#define INAT_PFX_REPE 3 /* 0xF3 */ /* LPFX3 */ +#define INAT_PFX_LOCK 4 /* 0xF0 */ +#define INAT_PFX_CS 5 /* 0x2E */ +#define INAT_PFX_DS 6 /* 0x3E */ +#define INAT_PFX_ES 7 /* 0x26 */ +#define INAT_PFX_FS 8 /* 0x64 */ +#define INAT_PFX_GS 9 /* 0x65 */ +#define INAT_PFX_SS 10 /* 0x36 */ +#define INAT_PFX_ADDRSZ 11 /* 0x67 */ + +#define INAT_LPREFIX_MAX 3 + +/* Immediate size */ +#define INAT_IMM_BYTE 1 +#define INAT_IMM_WORD 2 +#define INAT_IMM_DWORD 3 +#define INAT_IMM_QWORD 4 +#define INAT_IMM_PTR 5 +#define INAT_IMM_VWORD32 6 +#define INAT_IMM_VWORD 7 + +/* Legacy prefix */ +#define INAT_PFX_OFFS 0 +#define INAT_PFX_BITS 4 +#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) +#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) +/* Escape opcodes */ +#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS) +#define INAT_ESC_BITS 2 +#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1) +#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS) +/* Group opcodes (1-16) */ +#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS) +#define INAT_GRP_BITS 5 +#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1) +#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS) +/* Immediates */ +#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS) +#define INAT_IMM_BITS 3 +#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS) +/* Flags */ +#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) +#define INAT_REXPFX (1 << INAT_FLAG_OFFS) +#define INAT_MODRM (1 << (INAT_FLAG_OFFS + 1)) +#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 2)) +#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 3)) +#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 4)) +#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 5)) +/* Attribute making macros for attribute tables */ +#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) +#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) +#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) +#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + +/* Attribute search APIs */ +extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); +extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, + insn_byte_t last_pfx, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, + insn_byte_t last_pfx, + insn_attr_t esc_attr); + +/* Attribute checking functions */ +static inline int inat_is_prefix(insn_attr_t attr) +{ + return attr & INAT_PFX_MASK; +} + +static inline int inat_is_address_size_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ; +} + +static inline int inat_is_operand_size_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ; +} + +static inline int inat_last_prefix_id(insn_attr_t attr) +{ + if ((attr & INAT_PFX_MASK) > INAT_LPREFIX_MAX) + return 0; + else + return attr & INAT_PFX_MASK; +} + +static inline int inat_is_escape(insn_attr_t attr) +{ + return attr & INAT_ESC_MASK; +} + +static inline int inat_escape_id(insn_attr_t attr) +{ + return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS; +} + +static inline int inat_is_group(insn_attr_t attr) +{ + return attr & INAT_GRP_MASK; +} + +static inline int inat_group_id(insn_attr_t attr) +{ + return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS; +} + +static inline int inat_group_common_attribute(insn_attr_t attr) +{ + return attr & ~INAT_GRP_MASK; +} + 
+static inline int inat_has_immediate(insn_attr_t attr) +{ + return attr & INAT_IMM_MASK; +} + +static inline int inat_immediate_size(insn_attr_t attr) +{ + return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS; +} + +static inline int inat_is_rex_prefix(insn_attr_t attr) +{ + return attr & INAT_REXPFX; +} + +static inline int inat_has_modrm(insn_attr_t attr) +{ + return attr & INAT_MODRM; +} + +static inline int inat_is_force64(insn_attr_t attr) +{ + return attr & INAT_FORCE64; +} + +static inline int inat_has_second_immediate(insn_attr_t attr) +{ + return attr & INAT_SCNDIMM; +} + +static inline int inat_has_moffset(insn_attr_t attr) +{ + return attr & INAT_MOFFSET; +} + +static inline int inat_has_variant(insn_attr_t attr) +{ + return attr & INAT_VARIANT; +} + +#endif diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h new file mode 100644 index 000000000000..cb3c20ce39cf --- /dev/null +++ b/arch/x86/include/asm/inat_types.h @@ -0,0 +1,29 @@ +#ifndef _ASM_X86_INAT_TYPES_H +#define _ASM_X86_INAT_TYPES_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +/* Instruction attributes */ +typedef unsigned int insn_attr_t; +typedef unsigned char insn_byte_t; +typedef signed int insn_value_t; + +#endif diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h new file mode 100644 index 000000000000..12b4e3751d3f --- /dev/null +++ b/arch/x86/include/asm/insn.h @@ -0,0 +1,143 @@ +#ifndef _ASM_X86_INSN_H +#define _ASM_X86_INSN_H +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2009 + */ + +/* insn_attr_t is defined in inat.h */ +#include + +struct insn_field { + union { + insn_value_t value; + insn_byte_t bytes[4]; + }; + /* !0 if we've run insn_get_xxx() for this field */ + unsigned char got; + unsigned char nbytes; +}; + +struct insn { + struct insn_field prefixes; /* + * Prefixes + * prefixes.bytes[3]: last prefix + */ + struct insn_field rex_prefix; /* REX prefix */ + struct insn_field opcode; /* + * opcode.bytes[0]: opcode1 + * opcode.bytes[1]: opcode2 + * opcode.bytes[2]: opcode3 + */ + struct insn_field modrm; + struct insn_field sib; + struct insn_field displacement; + union { + struct insn_field immediate; + struct insn_field moffset1; /* for 64bit MOV */ + struct insn_field immediate1; /* for 64bit imm or off16/32 */ + }; + union { + struct insn_field moffset2; /* for 64bit MOV */ + struct insn_field immediate2; /* for 64bit imm or seg16 */ + }; + + insn_attr_t attr; + unsigned char opnd_bytes; + unsigned char addr_bytes; + unsigned char length; + unsigned char x86_64; + + const insn_byte_t *kaddr; /* kernel address of insn to analyze */ + const insn_byte_t *next_byte; +}; + +#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) +#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) +#define X86_MODRM_RM(modrm) ((modrm) & 0x07) + +#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) +#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) +#define X86_SIB_BASE(sib) ((sib) & 0x07) + +#define X86_REX_W(rex) ((rex) & 8) +#define X86_REX_R(rex) ((rex) & 4) +#define X86_REX_X(rex) ((rex) & 2) +#define X86_REX_B(rex) ((rex) & 1) + +/* The last prefix is needed for two-byte and three-byte opcodes */ +static inline insn_byte_t insn_last_prefix(struct insn *insn) +{ + return insn->prefixes.bytes[3]; +} + +extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); +extern void insn_get_prefixes(struct insn *insn); +extern void insn_get_opcode(struct insn *insn); +extern void insn_get_modrm(struct insn *insn); +extern void insn_get_sib(struct insn *insn); +extern void insn_get_displacement(struct insn *insn); +extern void insn_get_immediate(struct insn *insn); +extern void insn_get_length(struct insn *insn); + +/* Attribute will be determined after getting ModRM (for opcode groups) */ +static inline void insn_get_attribute(struct insn *insn) +{ + insn_get_modrm(insn); +} + +/* Instruction uses RIP-relative addressing */ +extern int insn_rip_relative(struct insn *insn); + +/* Init insn for kernel text */ +static inline void kernel_insn_init(struct insn *insn, const void *kaddr) +{ +#ifdef CONFIG_X86_64 + insn_init(insn, kaddr, 1); +#else /* CONFIG_X86_32 */ + insn_init(insn, kaddr, 0); +#endif +} + +/* Offset of each field from kaddr */ +static inline int insn_offset_rex_prefix(struct insn *insn) +{ + return insn->prefixes.nbytes; +} +static inline int insn_offset_opcode(struct insn *insn) +{ + return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; +} +static inline int insn_offset_modrm(struct insn *insn) +{ + return insn_offset_opcode(insn) + insn->opcode.nbytes; +} +static inline int insn_offset_sib(struct insn *insn) +{ + return insn_offset_modrm(insn) + insn->modrm.nbytes; +} +static inline int insn_offset_displacement(struct insn *insn) +{ + return insn_offset_sib(insn) + insn->sib.nbytes; +} +static inline int insn_offset_immediate(struct insn *insn) +{ + return insn_offset_displacement(insn) + insn->displacement.nbytes; +} + +#endif /* _ASM_X86_INSN_H */ diff --git a/arch/x86/lib/Makefile 
b/arch/x86/lib/Makefile index 07c31899c9c2..c77f8a7c531d 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,12 +2,25 @@ # Makefile for x86 specific library files. # +inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk +inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt +quiet_cmd_inat_tables = GEN $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + +$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call cmd,inat_tables) + +$(obj)/inat.o: $(obj)/inat-tables.c + +clean-files := inat-tables.c + obj-$(CONFIG_SMP) := msr.o lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += insn.o inat.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c new file mode 100644 index 000000000000..054656a01dfd --- /dev/null +++ b/arch/x86/lib/inat.c @@ -0,0 +1,78 @@ +/* + * x86 instruction attribute tables + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + */ +#include + +/* Attribute tables are generated from opcode map */ +#include "inat-tables.c" + +/* Attribute search APIs */ +insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) +{ + return inat_primary_table[opcode]; +} + +insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx, + insn_attr_t esc_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = inat_escape_id(esc_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = inat_last_prefix_id(lpfx_attr); + } + table = inat_escape_tables[n][0]; + if (!table) + return 0; + if (inat_has_variant(table[opcode]) && m) { + table = inat_escape_tables[n][m]; + if (!table) + return 0; + } + return table[opcode]; +} + +insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx, + insn_attr_t grp_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = inat_group_id(grp_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = inat_last_prefix_id(lpfx_attr); + } + table = inat_group_tables[n][0]; + if (!table) + return inat_group_common_attribute(grp_attr); + if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) { + table = inat_escape_tables[n][m]; + if (!table) + return inat_group_common_attribute(grp_attr); + } + return table[X86_MODRM_REG(modrm)] | + inat_group_common_attribute(grp_attr); +} + diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c new file mode 100644 index 000000000000..dfd56a30053f --- /dev/null +++ b/arch/x86/lib/insn.c @@ -0,0 +1,464 @@ +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004, 2009 + */ + +#include +#include +#include + +#define get_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define peek_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; r; }) + +/** + * insn_init() - initialize struct insn + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @x86_64: !0 for 64-bit kernel or 64-bit app + */ +void insn_init(struct insn *insn, const void *kaddr, int x86_64) +{ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->next_byte = kaddr; + insn->x86_64 = x86_64 ? 1 : 0; + insn->opnd_bytes = 4; + if (x86_64) + insn->addr_bytes = 8; + else + insn->addr_bytes = 4; +} + +/** + * insn_get_prefixes - scan x86 instruction prefix bytes + * @insn: &struct insn containing instruction + * + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte + * to point to the (first) opcode. No effect if @insn->prefixes.got + * is already set. 
+ */ +void insn_get_prefixes(struct insn *insn) +{ + struct insn_field *prefixes = &insn->prefixes; + insn_attr_t attr; + insn_byte_t b, lb; + int i, nb; + + if (prefixes->got) + return; + + nb = 0; + lb = 0; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + while (inat_is_prefix(attr)) { + /* Skip if same prefix */ + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == b) + goto found; + if (nb == 4) + /* Invalid instruction */ + break; + prefixes->bytes[nb++] = b; + if (inat_is_address_size_prefix(attr)) { + /* address size switches 2/4 or 4/8 */ + if (insn->x86_64) + insn->addr_bytes ^= 12; + else + insn->addr_bytes ^= 6; + } else if (inat_is_operand_size_prefix(attr)) { + /* oprand size switches 2/4 */ + insn->opnd_bytes ^= 6; + } +found: + prefixes->nbytes++; + insn->next_byte++; + lb = b; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + } + /* Set the last prefix */ + if (lb && lb != insn->prefixes.bytes[3]) { + if (unlikely(insn->prefixes.bytes[3])) { + /* Swap the last prefix */ + b = insn->prefixes.bytes[3]; + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == lb) + prefixes->bytes[i] = b; + } + insn->prefixes.bytes[3] = lb; + } + + if (insn->x86_64) { + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_rex_prefix(attr)) { + insn->rex_prefix.value = b; + insn->rex_prefix.nbytes = 1; + insn->next_byte++; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } + } + insn->rex_prefix.got = 1; + prefixes->got = 1; + return; +} + +/** + * insn_get_opcode - collect opcode(s) + * @insn: &struct insn containing instruction + * + * Populates @insn->opcode, updates @insn->next_byte to point past the + * opcode byte(s), and set @insn->attr (except for groups). + * If necessary, first collects any preceding (prefix) bytes. + * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got + * is already 1. + */ +void insn_get_opcode(struct insn *insn) +{ + struct insn_field *opcode = &insn->opcode; + insn_byte_t op, pfx; + if (opcode->got) + return; + if (!insn->prefixes.got) + insn_get_prefixes(insn); + + /* Get first opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[0] = op; + opcode->nbytes = 1; + insn->attr = inat_get_opcode_attribute(op); + while (inat_is_escape(insn->attr)) { + /* Get escaped opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[opcode->nbytes++] = op; + pfx = insn_last_prefix(insn); + insn->attr = inat_get_escape_attribute(op, pfx, insn->attr); + } + opcode->got = 1; +} + +/** + * insn_get_modrm - collect ModRM byte, if any + * @insn: &struct insn containing instruction + * + * Populates @insn->modrm and updates @insn->next_byte to point past the + * ModRM byte, if any. If necessary, first collects the preceding bytes + * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + */ +void insn_get_modrm(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + insn_byte_t pfx, mod; + if (modrm->got) + return; + if (!insn->opcode.got) + insn_get_opcode(insn); + + if (inat_has_modrm(insn->attr)) { + mod = get_next(insn_byte_t, insn); + modrm->value = mod; + modrm->nbytes = 1; + if (inat_is_group(insn->attr)) { + pfx = insn_last_prefix(insn); + insn->attr = inat_get_group_attribute(mod, pfx, + insn->attr); + } + } + + if (insn->x86_64 && inat_is_force64(insn->attr)) + insn->opnd_bytes = 8; + modrm->got = 1; +} + + +/** + * insn_rip_relative() - Does instruction use RIP-relative addressing mode? 
+ * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->x86_64 is 0. + */ +int insn_rip_relative(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + + if (!insn->x86_64) + return 0; + if (!modrm->got) + insn_get_modrm(insn); + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); +} + +/** + * insn_get_sib() - Get the SIB byte of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. + */ +void insn_get_sib(struct insn *insn) +{ + insn_byte_t modrm; + + if (insn->sib.got) + return; + if (!insn->modrm.got) + insn_get_modrm(insn); + if (insn->modrm.nbytes) { + modrm = (insn_byte_t)insn->modrm.value; + if (insn->addr_bytes != 2 && + X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { + insn->sib.value = get_next(insn_byte_t, insn); + insn->sib.nbytes = 1; + } + } + insn->sib.got = 1; +} + + +/** + * insn_get_displacement() - Get the displacement of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * SIB byte. + * Displacement value is sign-expanded. + */ +void insn_get_displacement(struct insn *insn) +{ + insn_byte_t mod, rm, base; + + if (insn->displacement.got) + return; + if (!insn->sib.got) + insn_get_sib(insn); + if (insn->modrm.nbytes) { + /* + * Interpreting the modrm byte: + * mod = 00 - no displacement fields (exceptions below) + * mod = 01 - 1-byte displacement field + * mod = 10 - displacement field is 4 bytes, or 2 bytes if + * address size = 2 (0x67 prefix in 32-bit mode) + * mod = 11 - no memory operand + * + * If address size = 2... + * mod = 00, r/m = 110 - displacement field is 2 bytes + * + * If address size != 2... 
+ * mod != 11, r/m = 100 - SIB byte exists + * mod = 00, SIB base = 101 - displacement field is 4 bytes + * mod = 00, r/m = 101 - rip-relative addressing, displacement + * field is 4 bytes + */ + mod = X86_MODRM_MOD(insn->modrm.value); + rm = X86_MODRM_RM(insn->modrm.value); + base = X86_SIB_BASE(insn->sib.value); + if (mod == 3) + goto out; + if (mod == 1) { + insn->displacement.value = get_next(char, insn); + insn->displacement.nbytes = 1; + } else if (insn->addr_bytes == 2) { + if ((mod == 0 && rm == 6) || mod == 2) { + insn->displacement.value = + get_next(short, insn); + insn->displacement.nbytes = 2; + } + } else { + if ((mod == 0 && rm == 5) || mod == 2 || + (mod == 0 && base == 5)) { + insn->displacement.value = get_next(int, insn); + insn->displacement.nbytes = 4; + } + } + } +out: + insn->displacement.got = 1; +} + +/* Decode moffset16/32/64 */ +static void __get_moffset(struct insn *insn) +{ + switch (insn->addr_bytes) { + case 2: + insn->moffset1.value = get_next(short, insn); + insn->moffset1.nbytes = 2; + break; + case 4: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + break; + case 8: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + insn->moffset2.value = get_next(int, insn); + insn->moffset2.nbytes = 4; + break; + } + insn->moffset1.got = insn->moffset2.got = 1; +} + +/* Decode imm v32(Iz) */ +static void __get_immv32(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case 4: + case 8: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + } +} + +/* Decode imm v64(Iv/Ov) */ +static void __get_immv(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + } + insn->immediate1.got = insn->immediate2.got = 1; +} + +/* Decode ptr16:16/32(Ap) */ +static void __get_immptr(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + /* ptr16:64 is not exist (no segment) */ + return; + } + insn->immediate2.value = get_next(unsigned short, insn); + insn->immediate2.nbytes = 2; + insn->immediate1.got = insn->immediate2.got = 1; +} + +/** + * insn_get_immediate() - Get the immediates of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * displacement bytes. + * Basically, most of immediates are sign-expanded. 
Unsigned-value can be + * get by bit masking with ((1 << (nbytes * 8)) - 1) + */ +void insn_get_immediate(struct insn *insn) +{ + if (insn->immediate.got) + return; + if (!insn->displacement.got) + insn_get_displacement(insn); + + if (inat_has_moffset(insn->attr)) { + __get_moffset(insn); + goto done; + } + + if (!inat_has_immediate(insn->attr)) + /* no immediates */ + goto done; + + switch (inat_immediate_size(insn->attr)) { + case INAT_IMM_BYTE: + insn->immediate.value = get_next(char, insn); + insn->immediate.nbytes = 1; + break; + case INAT_IMM_WORD: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case INAT_IMM_DWORD: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + case INAT_IMM_QWORD: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + case INAT_IMM_PTR: + __get_immptr(insn); + break; + case INAT_IMM_VWORD32: + __get_immv32(insn); + break; + case INAT_IMM_VWORD: + __get_immv(insn); + break; + default: + break; + } + if (inat_has_second_immediate(insn->attr)) { + insn->immediate2.value = get_next(char, insn); + insn->immediate2.nbytes = 1; + } +done: + insn->immediate.got = 1; +} + +/** + * insn_get_length() - Get the length of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * immediates bytes. + */ +void insn_get_length(struct insn *insn) +{ + if (insn->length) + return; + if (!insn->immediate.got) + insn_get_immediate(insn); + insn->length = (unsigned char)((unsigned long)insn->next_byte + - (unsigned long)insn->kaddr); +} diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt new file mode 100644 index 000000000000..083dd59dd74b --- /dev/null +++ b/arch/x86/lib/x86-opcode-map.txt @@ -0,0 +1,719 @@ +# x86 Opcode Maps +# +# +# Table: table-name +# Referrer: escaped-name +# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# (or) +# opcode: escape # escaped-name +# EndTable +# +# +# GrpTable: GrpXXX +# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] 
+# EndTable +# + +Table: one byte opcode +Referrer: +# 0x00 - 0x0f +00: ADD Eb,Gb +01: ADD Ev,Gv +02: ADD Gb,Eb +03: ADD Gv,Ev +04: ADD AL,Ib +05: ADD rAX,Iz +06: PUSH ES (i64) +07: POP ES (i64) +08: OR Eb,Gb +09: OR Ev,Gv +0a: OR Gb,Eb +0b: OR Gv,Ev +0c: OR AL,Ib +0d: OR rAX,Iz +0e: PUSH CS (i64) +0f: escape # 2-byte escape +# 0x10 - 0x1f +10: ADC Eb,Gb +11: ADC Ev,Gv +12: ADC Gb,Eb +13: ADC Gv,Ev +14: ADC AL,Ib +15: ADC rAX,Iz +16: PUSH SS (i64) +17: POP SS (i64) +18: SBB Eb,Gb +19: SBB Ev,Gv +1a: SBB Gb,Eb +1b: SBB Gv,Ev +1c: SBB AL,Ib +1d: SBB rAX,Iz +1e: PUSH DS (i64) +1f: POP DS (i64) +# 0x20 - 0x2f +20: AND Eb,Gb +21: AND Ev,Gv +22: AND Gb,Eb +23: AND Gv,Ev +24: AND AL,Ib +25: AND rAx,Iz +26: SEG=ES (Prefix) +27: DAA (i64) +28: SUB Eb,Gb +29: SUB Ev,Gv +2a: SUB Gb,Eb +2b: SUB Gv,Ev +2c: SUB AL,Ib +2d: SUB rAX,Iz +2e: SEG=CS (Prefix) +2f: DAS (i64) +# 0x30 - 0x3f +30: XOR Eb,Gb +31: XOR Ev,Gv +32: XOR Gb,Eb +33: XOR Gv,Ev +34: XOR AL,Ib +35: XOR rAX,Iz +36: SEG=SS (Prefix) +37: AAA (i64) +38: CMP Eb,Gb +39: CMP Ev,Gv +3a: CMP Gb,Eb +3b: CMP Gv,Ev +3c: CMP AL,Ib +3d: CMP rAX,Iz +3e: SEG=DS (Prefix) +3f: AAS (i64) +# 0x40 - 0x4f +40: INC eAX (i64) | REX (o64) +41: INC eCX (i64) | REX.B (o64) +42: INC eDX (i64) | REX.X (o64) +43: INC eBX (i64) | REX.XB (o64) +44: INC eSP (i64) | REX.R (o64) +45: INC eBP (i64) | REX.RB (o64) +46: INC eSI (i64) | REX.RX (o64) +47: INC eDI (i64) | REX.RXB (o64) +48: DEC eAX (i64) | REX.W (o64) +49: DEC eCX (i64) | REX.WB (o64) +4a: DEC eDX (i64) | REX.WX (o64) +4b: DEC eBX (i64) | REX.WXB (o64) +4c: DEC eSP (i64) | REX.WR (o64) +4d: DEC eBP (i64) | REX.WRB (o64) +4e: DEC eSI (i64) | REX.WRX (o64) +4f: DEC eDI (i64) | REX.WRXB (o64) +# 0x50 - 0x5f +50: PUSH rAX/r8 (d64) +51: PUSH rCX/r9 (d64) +52: PUSH rDX/r10 (d64) +53: PUSH rBX/r11 (d64) +54: PUSH rSP/r12 (d64) +55: PUSH rBP/r13 (d64) +56: PUSH rSI/r14 (d64) +57: PUSH rDI/r15 (d64) +58: POP rAX/r8 (d64) +59: POP rCX/r9 (d64) +5a: POP rDX/r10 (d64) +5b: POP rBX/r11 (d64) +5c: POP rSP/r12 (d64) +5d: POP rBP/r13 (d64) +5e: POP rSI/r14 (d64) +5f: POP rDI/r15 (d64) +# 0x60 - 0x6f +60: PUSHA/PUSHAD (i64) +61: POPA/POPAD (i64) +62: BOUND Gv,Ma (i64) +63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) +64: SEG=FS (Prefix) +65: SEG=GS (Prefix) +66: Operand-Size (Prefix) +67: Address-Size (Prefix) +68: PUSH Iz (d64) +69: IMUL Gv,Ev,Iz +6a: PUSH Ib (d64) +6b: IMUL Gv,Ev,Ib +6c: INS/INSB Yb,DX +6d: INS/INSW/INSD Yz,DX +6e: OUTS/OUTSB DX,Xb +6f: OUTS/OUTSW/OUTSD DX,Xz +# 0x70 - 0x7f +70: JO Jb +71: JNO Jb +72: JB/JNAE/JC Jb +73: JNB/JAE/JNC Jb +74: JZ/JE Jb +75: JNZ/JNE Jb +76: JBE/JNA Jb +77: JNBE/JA Jb +78: JS Jb +79: JNS Jb +7a: JP/JPE Jb +7b: JNP/JPO Jb +7c: JL/JNGE Jb +7d: JNL/JGE Jb +7e: JLE/JNG Jb +7f: JNLE/JG Jb +# 0x80 - 0x8f +80: Grp1 Eb,Ib (1A) +81: Grp1 Ev,Iz (1A) +82: Grp1 Eb,Ib (1A),(i64) +83: Grp1 Ev,Ib (1A) +84: TEST Eb,Gb +85: TEST Ev,Gv +86: XCHG Eb,Gb +87: XCHG Ev,Gv +88: MOV Eb,Gb +89: MOV Ev,Gv +8a: MOV Gb,Eb +8b: MOV Gv,Ev +8c: MOV Ev,Sw +8d: LEA Gv,M +8e: MOV Sw,Ew +8f: Grp1A (1A) | POP Ev (d64) +# 0x90 - 0x9f +90: NOP | PAUSE (F3) | XCHG r8,rAX +91: XCHG rCX/r9,rAX +92: XCHG rDX/r10,rAX +93: XCHG rBX/r11,rAX +94: XCHG rSP/r12,rAX +95: XCHG rBP/r13,rAX +96: XCHG rSI/r14,rAX +97: XCHG rDI/r15,rAX +98: CBW/CWDE/CDQE +99: CWD/CDQ/CQO +9a: CALLF Ap (i64) +9b: FWAIT/WAIT +9c: PUSHF/D/Q Fv (d64) +9d: POPF/D/Q Fv (d64) +9e: SAHF +9f: LAHF +# 0xa0 - 0xaf +a0: MOV AL,Ob +a1: MOV rAX,Ov +a2: MOV Ob,AL +a3: MOV Ov,rAX +a4: MOVS/B Xb,Yb +a5: MOVS/W/D/Q Xv,Yv +a6: CMPS/B Xb,Yb +a7: CMPS/W/D Xv,Yv +a8: TEST AL,Ib +a9: 
TEST rAX,Iz +aa: STOS/B Yb,AL +ab: STOS/W/D/Q Yv,rAX +ac: LODS/B AL,Xb +ad: LODS/W/D/Q rAX,Xv +ae: SCAS/B AL,Yb +af: SCAS/W/D/Q rAX,Xv +# 0xb0 - 0xbf +b0: MOV AL/R8L,Ib +b1: MOV CL/R9L,Ib +b2: MOV DL/R10L,Ib +b3: MOV BL/R11L,Ib +b4: MOV AH/R12L,Ib +b5: MOV CH/R13L,Ib +b6: MOV DH/R14L,Ib +b7: MOV BH/R15L,Ib +b8: MOV rAX/r8,Iv +b9: MOV rCX/r9,Iv +ba: MOV rDX/r10,Iv +bb: MOV rBX/r11,Iv +bc: MOV rSP/r12,Iv +bd: MOV rBP/r13,Iv +be: MOV rSI/r14,Iv +bf: MOV rDI/r15,Iv +# 0xc0 - 0xcf +c0: Grp2 Eb,Ib (1A) +c1: Grp2 Ev,Ib (1A) +c2: RETN Iw (f64) +c3: RETN +c4: LES Gz,Mp (i64) +c5: LDS Gz,Mp (i64) +c6: Grp11 Eb,Ib (1A) +c7: Grp11 Ev,Iz (1A) +c8: ENTER Iw,Ib +c9: LEAVE (d64) +ca: RETF Iw +cb: RETF +cc: INT3 +cd: INT Ib +ce: INTO (i64) +cf: IRET/D/Q +# 0xd0 - 0xdf +d0: Grp2 Eb,1 (1A) +d1: Grp2 Ev,1 (1A) +d2: Grp2 Eb,CL (1A) +d3: Grp2 Ev,CL (1A) +d4: AAM Ib (i64) +d5: AAD Ib (i64) +d6: +d7: XLAT/XLATB +d8: ESC +d9: ESC +da: ESC +db: ESC +dc: ESC +dd: ESC +de: ESC +df: ESC +# 0xe0 - 0xef +e0: LOOPNE/LOOPNZ Jb (f64) +e1: LOOPE/LOOPZ Jb (f64) +e2: LOOP Jb (f64) +e3: JrCXZ Jb (f64) +e4: IN AL,Ib +e5: IN eAX,Ib +e6: OUT Ib,AL +e7: OUT Ib,eAX +e8: CALL Jz (f64) +e9: JMP-near Jz (f64) +ea: JMP-far Ap (i64) +eb: JMP-short Jb (f64) +ec: IN AL,DX +ed: IN eAX,DX +ee: OUT DX,AL +ef: OUT DX,eAX +# 0xf0 - 0xff +f0: LOCK (Prefix) +f1: +f2: REPNE (Prefix) +f3: REP/REPE (Prefix) +f4: HLT +f5: CMC +f6: Grp3_1 Eb (1A) +f7: Grp3_2 Ev (1A) +f8: CLC +f9: STC +fa: CLI +fb: STI +fc: CLD +fd: STD +fe: Grp4 (1A) +ff: Grp5 (1A) +EndTable + +Table: 2-byte opcode # First Byte is 0x0f +Referrer: 2-byte escape +# 0x0f 0x00-0x0f +00: Grp6 (1A) +01: Grp7 (1A) +02: LAR Gv,Ew +03: LSL Gv,Ew +04: +05: SYSCALL (o64) +06: CLTS +07: SYSRET (o64) +08: INVD +09: WBINVD +0a: +0b: UD2 (1B) +0c: +0d: NOP Ev +0e: +0f: +# 0x0f 0x10-0x1f +10: +11: +12: +13: +14: +15: +16: +17: +18: Grp16 (1A) +19: +1a: +1b: +1c: +1d: +1e: +1f: NOP Ev +# 0x0f 0x20-0x2f +20: MOV Rd,Cd +21: MOV Rd,Dd +22: MOV Cd,Rd +23: MOV Dd,Rd +24: +25: +26: +27: +28: movaps Vps,Wps | movapd Vpd,Wpd (66) +29: movaps Wps,Vps | movapd Wpd,Vpd (66) +2a: +2b: +2c: +2d: +2e: +2f: +# 0x0f 0x30-0x3f +30: WRMSR +31: RDTSC +32: RDMSR +33: RDPMC +34: SYSENTER +35: SYSEXIT +36: +37: GETSEC +38: escape # 3-byte escape 1 +39: +3a: escape # 3-byte escape 2 +3b: +3c: +3d: +3e: +3f: +# 0x0f 0x40-0x4f +40: CMOVO Gv,Ev +41: CMOVNO Gv,Ev +42: CMOVB/C/NAE Gv,Ev +43: CMOVAE/NB/NC Gv,Ev +44: CMOVE/Z Gv,Ev +45: CMOVNE/NZ Gv,Ev +46: CMOVBE/NA Gv,Ev +47: CMOVA/NBE Gv,Ev +48: CMOVS Gv,Ev +49: CMOVNS Gv,Ev +4a: CMOVP/PE Gv,Ev +4b: CMOVNP/PO Gv,Ev +4c: CMOVL/NGE Gv,Ev +4d: CMOVNL/GE Gv,Ev +4e: CMOVLE/NG Gv,Ev +4f: CMOVNLE/G Gv,Ev +# 0x0f 0x50-0x5f +50: +51: +52: +53: +54: +55: +56: +57: +58: +59: +5a: +5b: +5c: +5d: +5e: +5f: +# 0x0f 0x60-0x6f +60: +61: +62: +63: +64: +65: +66: +67: +68: +69: +6a: +6b: +6c: +6d: +6e: +6f: +# 0x0f 0x70-0x7f +70: +71: Grp12 (1A) +72: Grp13 (1A) +73: Grp14 (1A) +74: +75: +76: +77: +78: VMREAD Ed/q,Gd/q +79: VMWRITE Gd/q,Ed/q +7a: +7b: +7c: +7d: +7e: +7f: +# 0x0f 0x80-0x8f +80: JO Jz (f64) +81: JNO Jz (f64) +82: JB/JNAE/JC Jz (f64) +83: JNB/JAE/JNC Jz (f64) +84: JZ/JE Jz (f64) +85: JNZ/JNE Jz (f64) +86: JBE/JNA Jz (f64) +87: JNBE/JA Jz (f64) +88: JS Jz (f64) +89: JNS Jz (f64) +8a: JP/JPE Jz (f64) +8b: JNP/JPO Jz (f64) +8c: JL/JNGE Jz (f64) +8d: JNL/JGE Jz (f64) +8e: JLE/JNG Jz (f64) +8f: JNLE/JG Jz (f64) +# 0x0f 0x90-0x9f +90: SETO Eb +91: SETNO Eb +92: SETB/C/NAE Eb +93: SETAE/NB/NC Eb +94: SETE/Z Eb +95: SETNE/NZ Eb +96: SETBE/NA Eb +97: SETA/NBE Eb +98: SETS Eb +99: SETNS Eb 
+9a: SETP/PE Eb +9b: SETNP/PO Eb +9c: SETL/NGE Eb +9d: SETNL/GE Eb +9e: SETLE/NG Eb +9f: SETNLE/G Eb +# 0x0f 0xa0-0xaf +a0: PUSH FS (d64) +a1: POP FS (d64) +a2: CPUID +a3: BT Ev,Gv +a4: SHLD Ev,Gv,Ib +a5: SHLD Ev,Gv,CL +a6: +a7: GrpRNG +a8: PUSH GS (d64) +a9: POP GS (d64) +aa: RSM +ab: BTS Ev,Gv +ac: SHRD Ev,Gv,Ib +ad: SHRD Ev,Gv,CL +ae: Grp15 (1A),(1C) +af: IMUL Gv,Ev +# 0x0f 0xb0-0xbf +b0: CMPXCHG Eb,Gb +b1: CMPXCHG Ev,Gv +b2: LSS Gv,Mp +b3: BTR Ev,Gv +b4: LFS Gv,Mp +b5: LGS Gv,Mp +b6: MOVZX Gv,Eb +b7: MOVZX Gv,Ew +b8: JMPE | POPCNT Gv,Ev (F3) +b9: Grp10 (1A) +ba: Grp8 Ev,Ib (1A) +bb: BTC Ev,Gv +bc: BSF Gv,Ev +bd: BSR Gv,Ev +be: MOVSX Gv,Eb +bf: MOVSX Gv,Ew +# 0x0f 0xc0-0xcf +c0: XADD Eb,Gb +c1: XADD Ev,Gv +c2: +c3: movnti Md/q,Gd/q +c4: +c5: +c6: +c7: Grp9 (1A) +c8: BSWAP RAX/EAX/R8/R8D +c9: BSWAP RCX/ECX/R9/R9D +ca: BSWAP RDX/EDX/R10/R10D +cb: BSWAP RBX/EBX/R11/R11D +cc: BSWAP RSP/ESP/R12/R12D +cd: BSWAP RBP/EBP/R13/R13D +ce: BSWAP RSI/ESI/R14/R14D +cf: BSWAP RDI/EDI/R15/R15D +# 0x0f 0xd0-0xdf +d0: +d1: +d2: +d3: +d4: +d5: +d6: +d7: +d8: +d9: +da: +db: +dc: +dd: +de: +df: +# 0x0f 0xe0-0xef +e0: +e1: +e2: +e3: +e4: +e5: +e6: +e7: +e8: +e9: +ea: +eb: +ec: +ed: +ee: +ef: +# 0x0f 0xf0-0xff +f0: +f1: +f2: +f3: +f4: +f5: +f6: +f7: +f8: +f9: +fa: +fb: +fc: +fd: +fe: +ff: +EndTable + +Table: 3-byte opcode 1 +Referrer: 3-byte escape 1 +80: INVEPT Gd/q,Mdq (66) +81: INVPID Gd/q,Mdq (66) +f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) +f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) +EndTable + +Table: 3-byte opcode 2 +Referrer: 3-byte escape 2 +# all opcode is for SSE +EndTable + +GrpTable: Grp1 +0: ADD +1: OR +2: ADC +3: SBB +4: AND +5: SUB +6: XOR +7: CMP +EndTable + +GrpTable: Grp1A +0: POP +EndTable + +GrpTable: Grp2 +0: ROL +1: ROR +2: RCL +3: RCR +4: SHL/SAL +5: SHR +6: +7: SAR +EndTable + +GrpTable: Grp3_1 +0: TEST Eb,Ib +1: +2: NOT Eb +3: NEG Eb +4: MUL AL,Eb +5: IMUL AL,Eb +6: DIV AL,Eb +7: IDIV AL,Eb +EndTable + +GrpTable: Grp3_2 +0: TEST Ev,Iz +1: +2: NOT Ev +3: NEG Ev +4: MUL rAX,Ev +5: IMUL rAX,Ev +6: DIV rAX,Ev +7: IDIV rAX,Ev +EndTable + +GrpTable: Grp4 +0: INC Eb +1: DEC Eb +EndTable + +GrpTable: Grp5 +0: INC Ev +1: DEC Ev +2: CALLN Ev (f64) +3: CALLF Ep +4: JMPN Ev (f64) +5: JMPF Ep +6: PUSH Ev (d64) +7: +EndTable + +GrpTable: Grp6 +0: SLDT Rv/Mw +1: STR Rv/Mw +2: LLDT Ew +3: LTR Ew +4: VERR Ew +5: VERW Ew +EndTable + +GrpTable: Grp7 +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) +3: LIDT Ms +4: SMSW Mw/Rv +5: +6: LMSW Ew +7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) +EndTable + +GrpTable: Grp8 +4: BT +5: BTS +6: BTR +7: BTC +EndTable + +GrpTable: Grp9 +1: CMPXCHG8B/16B Mq/Mdq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) +7: VMPTRST Mq +EndTable + +GrpTable: Grp10 +EndTable + +GrpTable: Grp11 +0: MOV +EndTable + +GrpTable: Grp12 +EndTable + +GrpTable: Grp13 +EndTable + +GrpTable: Grp14 +EndTable + +GrpTable: Grp15 +0: fxsave +1: fxstor +2: ldmxcsr +3: stmxcsr +4: XSAVE +5: XRSTOR | lfence (11B) +6: mfence (11B) +7: clflush | sfence (11B) +EndTable + +GrpTable: Grp16 +0: prefetch NTA +1: prefetch T0 +2: prefetch T1 +3: prefetch T2 +EndTable + +GrpTable: GrpRNG +0: xstore-rng +1: xcrypt-ecb +2: xcrypt-cbc +4: xcrypt-cfb +5: xcrypt-ofb +EndTable diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk new file mode 100644 index 000000000000..93b62c92d044 --- /dev/null +++ 
b/arch/x86/tools/gen-insn-attr-x86.awk @@ -0,0 +1,314 @@ +#!/bin/awk -f +# gen-insn-attr-x86.awk: Instruction attribute table generator +# Written by Masami Hiramatsu +# +# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c + +BEGIN { + print "/* x86 opcode map generated from x86-opcode-map.txt */" + print "/* Do not change this code. */" + ggid = 1 + geid = 1 + + opnd_expr = "^[[:alpha:]]" + ext_expr = "^\\(" + sep_expr = "^\\|$" + group_expr = "^Grp[[:alnum:]]+" + + imm_expr = "^[IJAO][[:lower:]]" + imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" + imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)" + imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)" + imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)" + imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" + imm_flag["Ob"] = "INAT_MOFFSET" + imm_flag["Ov"] = "INAT_MOFFSET" + + modrm_expr = "^([CDEGMNPQRSUVW][[:lower:]]+|NTA|T[012])" + force64_expr = "\\([df]64\\)" + rex_expr = "^REX(\\.[XRWB]+)*" + fpu_expr = "^ESC" # TODO + + lprefix1_expr = "\\(66\\)" + delete lptable1 + lprefix2_expr = "\\(F2\\)" + delete lptable2 + lprefix3_expr = "\\(F3\\)" + delete lptable3 + max_lprefix = 4 + + prefix_expr = "\\(Prefix\\)" + prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" + prefix_num["REPNE"] = "INAT_PFX_REPNE" + prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["LOCK"] = "INAT_PFX_LOCK" + prefix_num["SEG=CS"] = "INAT_PFX_CS" + prefix_num["SEG=DS"] = "INAT_PFX_DS" + prefix_num["SEG=ES"] = "INAT_PFX_ES" + prefix_num["SEG=FS"] = "INAT_PFX_FS" + prefix_num["SEG=GS"] = "INAT_PFX_GS" + prefix_num["SEG=SS"] = "INAT_PFX_SS" + prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" + + delete table + delete etable + delete gtable + eid = -1 + gid = -1 +} + +function semantic_error(msg) { + print "Semantic error at " NR ": " msg > "/dev/stderr" + exit 1 +} + +function debug(msg) { + print "DEBUG: " msg +} + +function array_size(arr, i,c) { + c = 0 + for (i in arr) + c++ + return c +} + +/^Table:/ { + print "/* " $0 " */" +} + +/^Referrer:/ { + if (NF == 1) { + # primary opcode table + tname = "inat_primary_table" + eid = -1 + } else { + # escape opcode table + ref = "" + for (i = 2; i <= NF; i++) + ref = ref $i + eid = escape[ref] + tname = sprintf("inat_escape_table_%d", eid) + } +} + +/^GrpTable:/ { + print "/* " $0 " */" + if (!($2 in group)) + semantic_error("No group: " $2 ) + gid = group[$2] + tname = "inat_group_table_" gid +} + +function print_table(tbl,name,fmt,n) +{ + print "const insn_attr_t " name " = {" + for (i = 0; i < n; i++) { + id = sprintf(fmt, i) + if (tbl[id]) + print " [" id "] = " tbl[id] "," + } + print "};" +} + +/^EndTable/ { + if (gid != -1) { + # print group tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,3] = tname "_3" + } + } else { + # print primary/escaped tables + if (array_size(table) != 0) { + print_table(table, tname 
"[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,3] = tname "_3" + } + } + print "" + delete table + delete lptable1 + delete lptable2 + delete lptable3 + gid = -1 + eid = -1 +} + +function add_flags(old,new) { + if (old && new) + return old " | " new + else if (old) + return old + else + return new +} + +# convert operands to flags. +function convert_operands(opnd, i,imm,mod) +{ + imm = null + mod = null + for (i in opnd) { + i = opnd[i] + if (match(i, imm_expr) == 1) { + if (!imm_flag[i]) + semantic_error("Unknown imm opnd: " i) + if (imm) { + if (i != "Ib") + semantic_error("Second IMM error") + imm = add_flags(imm, "INAT_SCNDIMM") + } else + imm = imm_flag[i] + } else if (match(i, modrm_expr)) + mod = "INAT_MODRM" + } + return add_flags(imm, mod) +} + +/^[0-9a-f]+\:/ { + if (NR == 1) + next + # get index + idx = "0x" substr($1, 1, index($1,":") - 1) + if (idx in table) + semantic_error("Redefine " idx " in " tname) + + # check if escaped opcode + if ("escape" == $2) { + if ($3 != "#") + semantic_error("No escaped name") + ref = "" + for (i = 4; i <= NF; i++) + ref = ref $i + if (ref in escape) + semantic_error("Redefine escape (" ref ")") + escape[ref] = geid + geid++ + table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")" + next + } + + variant = null + # converts + i = 2 + while (i <= NF) { + opcode = $(i++) + delete opnds + ext = null + flags = null + opnd = null + # parse one opcode + if (match($i, opnd_expr)) { + opnd = $i + split($(i++), opnds, ",") + flags = convert_operands(opnds) + } + if (match($i, ext_expr)) + ext = $(i++) + if (match($i, sep_expr)) + i++ + else if (i < NF) + semantic_error($i " is not a separator") + + # check if group opcode + if (match(opcode, group_expr)) { + if (!(opcode in group)) { + group[opcode] = ggid + ggid++ + } + flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")") + } + # check force(or default) 64bit + if (match(ext, force64_expr)) + flags = add_flags(flags, "INAT_FORCE64") + + # check REX prefix + if (match(opcode, rex_expr)) + flags = add_flags(flags, "INAT_REXPFX") + + # check coprocessor escape : TODO + if (match(opcode, fpu_expr)) + flags = add_flags(flags, "INAT_MODRM") + + # check prefixes + if (match(ext, prefix_expr)) { + if (!prefix_num[opcode]) + semantic_error("Unknown prefix: " opcode) + flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")") + } + if (length(flags) == 0) + continue + # check if last prefix + if (match(ext, lprefix1_expr)) { + lptable1[idx] = add_flags(lptable1[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix2_expr)) { + lptable2[idx] = add_flags(lptable2[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix3_expr)) { + lptable3[idx] = add_flags(lptable3[idx],flags) + variant = "INAT_VARIANT" + } else { + table[idx] = add_flags(table[idx],flags) + } + } + if (variant) + table[idx] = add_flags(table[idx],variant) +} + +END { + # print escape opcode map's array + print "/* Escape opcode map array */" + print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LPREFIX_MAX + 1] = {" + for (i = 0; i < geid; i++) 
+ for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print " ["i"]["j"] = "etable[i,j]"," + print "};\n" + # print group opcode map's array + print "/* Group opcode map array */" + print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LPREFIX_MAX + 1] = {" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if (gtable[i,j]) + print " ["i"]["j"] = "gtable[i,j]"," + print "};" +} -- cgit v1.2.2 From ca0e9badd1a39fecdd235f4bf1481b9da756e27b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:21 -0400 Subject: x86: X86 instruction decoder build-time selftest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a user-space selftest of the x86 instruction decoder at kernel build time. When CONFIG_X86_DECODER_SELFTEST=y, Kbuild builds a test harness of the x86 instruction decoder and runs it after building vmlinux. The test compares the results of objdump and the x86 instruction decoder code and checks that there are no differences. Signed-off-by: Masami Hiramatsu Signed-off-by: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203421.31965.29006.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/Kconfig.debug | 9 ++++ arch/x86/Makefile | 3 ++ arch/x86/tools/Makefile | 15 ++++++ arch/x86/tools/distill.awk | 42 ++++++++++++++++ arch/x86/tools/test_get_len.c | 113 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 182 insertions(+) create mode 100644 arch/x86/tools/Makefile create mode 100644 arch/x86/tools/distill.awk create mode 100644 arch/x86/tools/test_get_len.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d105f29bb6bb..7d0b681a132b 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -186,6 +186,15 @@ config X86_DS_SELFTEST config HAVE_MMIOTRACE_SUPPORT def_bool y +config X86_DECODER_SELFTEST + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL + ---help--- + Perform x86 instruction decoder selftests at build time. + This option is useful for checking the sanity of x86 instruction + decoder code. + If unsure, say "N". + # # IO delay types: # diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659c41b4..5fe16bfd15ac 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -154,6 +154,9 @@ all: bzImage KBUILD_IMAGE := $(boot)/bzImage bzImage: vmlinux +ifeq ($(CONFIG_X86_DECODER_SELFTEST),y) + $(Q)$(MAKE) $(build)=arch/x86/tools posttest +endif $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile new file mode 100644 index 000000000000..3dd626b99dc8 --- /dev/null +++ b/arch/x86/tools/Makefile @@ -0,0 +1,15 @@ +PHONY += posttest +quiet_cmd_posttest = TEST $@ + cmd_posttest = $(OBJDUMP) -d $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len + +posttest: $(obj)/test_get_len vmlinux + $(call cmd,posttest) + +hostprogs-y := test_get_len + +# -I needed for generated C source and C source which in the kernel tree.
+HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ + +# Dependancies are also needed. +$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk new file mode 100644 index 000000000000..d433619bb866 --- /dev/null +++ b/arch/x86/tools/distill.awk @@ -0,0 +1,42 @@ +#!/bin/awk -f +# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len +# Distills the disassembly as follows: +# - Removes all lines except the disassembled instructions. +# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes +# into a single line. +# - Remove bad(or prefix only) instructions + +BEGIN { + prev_addr = "" + prev_hex = "" + prev_mnemonic = "" + bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" + fwait_expr = "^9b " + fwait_str="9b\tfwait" +} + +/^ *[0-9a-f]+:/ { + if (split($0, field, "\t") < 3) { + # This is a continuation of the same insn. + prev_hex = prev_hex field[2] + } else { + # Skip bad instructions + if (match(prev_mnemonic, bad_expr)) + prev_addr = "" + # Split fwait from other f* instructions + if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") { + printf "%s\t%s\n", prev_addr, fwait_str + sub(fwait_expr, "", prev_hex) + } + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic + prev_addr = field[1] + prev_hex = field[2] + prev_mnemonic = field[3] + } +} + +END { + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic +} diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c new file mode 100644 index 000000000000..1e81adb2d8a9 --- /dev/null +++ b/arch/x86/tools/test_get_len.c @@ -0,0 +1,113 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include +#include +#include +#include + +#ifdef __x86_64__ +#define CONFIG_X86_64 +#else +#define CONFIG_X86_32 +#endif +#define unlikely(cond) (cond) + +#include +#include +#include + +/* + * Test of instruction analysis in general and insn_get_length() in + * particular. See if insn_get_length() and the disassembler agree + * on the length of each instruction in an elf disassembly. 
+ * + * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len + */ + +const char *prog; + +static void usage(void) +{ + fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" + " ./test_get_len\n"); + exit(1); +} + +static void malformed_line(const char *line, int line_nr) +{ + fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line); + exit(3); +} + +#define BUFSIZE 256 + +int main(int argc, char **argv) +{ + char line[BUFSIZE]; + unsigned char insn_buf[16]; + struct insn insn; + int insns = 0; + + prog = argv[0]; + if (argc > 1) + usage(); + + while (fgets(line, BUFSIZE, stdin)) { + char copy[BUFSIZE], *s, *tab1, *tab2; + int nb = 0; + unsigned int b; + + insns++; + memset(insn_buf, 0, 16); + strcpy(copy, line); + tab1 = strchr(copy, '\t'); + if (!tab1) + malformed_line(line, insns); + s = tab1 + 1; + s += strspn(s, " "); + tab2 = strchr(s, '\t'); + if (!tab2) + malformed_line(line, insns); + *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ + while (s < tab2) { + if (sscanf(s, "%x", &b) == 1) { + insn_buf[nb++] = (unsigned char) b; + s += 3; + } else + break; + } + /* Decode an instruction */ +#ifdef __x86_64__ + insn_init(&insn, insn_buf, 1); +#else + insn_init(&insn, insn_buf, 0); +#endif + insn_get_length(&insn); + if (insn.length != nb) { + fprintf(stderr, "Error: %s", line); + fprintf(stderr, "Error: objdump says %d bytes, but " + "insn_get_length() says %d (attr:%x)\n", nb, + insn.length, insn.attr); + exit(2); + } + } + fprintf(stderr, "Succeed: decoded and checked %d instructions\n", + insns); + return 0; +} -- cgit v1.2.2 From b46b3d70c9c017d7c4ec49f7f3ffd0af5a622277 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:28 -0400 Subject: kprobes: Check probe address is at an instruction boundary on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure the safety of inserting kprobes by checking whether the specified address is at the first byte of an instruction on x86. This is done by decoding the probed function from its start to the probe point. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203428.31965.21939.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..aa15f3e1f64b 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -48,12 +48,14 @@ #include #include #include +#include #include #include #include #include #include +#include void jprobe_return_end(void); @@ -244,6 +246,75 @@ retry: } } +/* Recover the probed instruction at addr for further analysis. */ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + kp = get_kprobe((void *)addr); + if (!kp) + return -EINVAL; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, a RIP-relative instruction can not be single-stepped + * at a different place, so fix_riprel() tweaks the displacement of + * that instruction.
In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -359,6 +430,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) -- cgit v1.2.2 From 89ae465b0ee470f7d3f8a1c61353445c3acbbe2a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:36 -0400 Subject: kprobes: Cleanup fix_riprel() using insn decoder on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup fix_riprel() in arch/x86/kernel/kprobes.c by using the new x86 instruction decoder instead of using comparisons with raw ad hoc numeric opcodes. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: PrzemysÅ‚aw PaweÅ‚czyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203436.31965.34374.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 128 +++++++++------------------------------------- 1 file changed, 23 insertions(+), 105 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index aa15f3e1f64b..16ae9610f6ff 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -108,50 +108,6 @@ static const u32 twobyte_is_boostable[256 / 32] = { /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W 
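For context on what is being deleted: the W()-built maps above pack one attribute bit per opcode, eight 32-bit words per 256-entry table. A hedged sketch of the lookup they supported, equivalent on little-endian x86 to the test_bit() calls this patch removes (illustrative, not kernel code):

    /* Query a W()-style bitmap: nonzero when 'opcode' takes a ModRM
     * byte. Word n/32, bit n%32, matching test_bit()'s layout. */
    static int opcode_has_modrm(const unsigned int map[8], unsigned char opcode)
    {
            return (map[opcode / 32] >> (opcode % 32)) & 1;
    }
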
struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -348,68 +304,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static void __kprobes fix_riprel(struct kprobe *p) { #ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kernel_insn_init(&insn, p->ainsn.insn); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) p->addr + (s64) insn.displacement.value - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; } #endif } -- cgit v1.2.2 From b1cf540f0e5278ecfe8532557e547d833ed269d7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:44 -0400 Subject: x86: Add pt_regs register and stack access APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add following APIs for accessing registers and stack entries from pt_regs. These APIs are required by kprobes-based event tracer on ftrace. Some other debugging tools might be able to use it too. - regs_query_register_offset(const char *name) Query the offset of "name" register. - regs_query_register_name(unsigned int offset) Query the name of register by its offset. - regs_get_register(struct pt_regs *regs, unsigned int offset) Get the value of a register by its offset. 
- regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) Check whether the address is within the kernel stack. - regs_get_kernel_stack_nth(struct pt_regs *reg, unsigned int nth) Get Nth entry of the kernel stack. (N >= 0) - regs_get_argument_nth(struct pt_regs *reg, unsigned int nth) Get Nth argument at function call. (N >= 0) Signed-off-by: Masami Hiramatsu Cc: linux-arch@vger.kernel.org Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203444.31965.26374.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/ptrace.h | 62 +++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 112 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 0f0d908349aa..a3d49dd7d26e 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -7,6 +7,7 @@ #ifdef __KERNEL__ #include +#include #endif #ifndef __ASSEMBLY__ @@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->sp; } +/* Query offset/name of register from its name/offset */ +extern int regs_query_register_offset(const char *name); +extern const char *regs_query_register_name(unsigned int offset); +#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) + +/** + * regs_get_register() - get register value from its offset + * @regs: pt_regs from which the register value is read. + * @offset: offset number of the register. + * + * regs_get_register returns the value of a register whose offset from @regs + * is @offset. The @offset is the offset of the register in struct pt_regs. + * If @offset is bigger than MAX_REG_OFFSET, this returns 0. + */ +static inline unsigned long regs_get_register(struct pt_regs *regs, + unsigned int offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + return *(unsigned long *)((unsigned long)regs + offset); } + +/** + * regs_within_kernel_stack() - check whether an address is in the stack + * @regs: pt_regs which contains kernel stack pointer. + * @addr: address which is checked. + * + * regs_within_kernel_stack() checks whether @addr is within the kernel stack page(s). + * If @addr is within the kernel stack, it returns true. If not, returns false. + */ +static inline int regs_within_kernel_stack(struct pt_regs *regs, + unsigned long addr) +{ + return ((addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); } + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specified by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0.
+ */ +static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, + unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return *addr; + else + return 0; +} + +/* Get Nth argument at function call */ +extern unsigned long regs_get_argument_nth(struct pt_regs *regs, + unsigned int n); + /* * These are defined as per linux/ptrace.h, which see. */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8d7d5c9c1be3..a33a17d5d5c8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -49,6 +49,118 @@ enum x86_regset { REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +static const int arg_offs_table[] = { +#ifdef CONFIG_X86_32 + [0] = offsetof(struct pt_regs, ax), + [1] = offsetof(struct pt_regs, dx), + [2] = offsetof(struct pt_regs, cx) +#else /* CONFIG_X86_64 */ + [0] = offsetof(struct pt_regs, di), + [1] = offsetof(struct pt_regs, si), + [2] = offsetof(struct pt_regs, dx), + [3] = offsetof(struct pt_regs, cx), + [4] = offsetof(struct pt_regs, r8), + [5] = offsetof(struct pt_regs, r9) +#endif +}; + +/** + * regs_get_argument_nth() - get Nth argument at function call + * @regs: pt_regs which contains registers at function entry. + * @n: argument number. + * + * regs_get_argument_nth() returns @n th argument of a function call. + * Since usually the kernel stack will be changed right after function entry, + * you must use this at function entry. If the @n th entry is NOT in the + * kernel stack or pt_regs, this returns 0. 
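A short usage sketch of these accessors, e.g. from a probe handler (the handler shape and the "ip" lookup are illustrative, not part of this patch):

    #include <linux/kernel.h>
    #include <linux/ptrace.h>

    /* Print the saved instruction pointer and the first call argument
     * from a pt_regs snapshot, using the APIs introduced above. */
    static void dump_probe_context(struct pt_regs *regs)
    {
            int off = regs_query_register_offset("ip");  /* -EINVAL if unknown */

            if (off >= 0)
                    printk(KERN_DEBUG "ip=%lx arg0=%lx\n",
                           regs_get_register(regs, off),
                           regs_get_argument_nth(regs, 0));
    }
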
+ */ +unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) +{ + if (n < ARRAY_SIZE(arg_offs_table)) + return *((unsigned long *)regs + arg_offs_table[n]); + else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + n -= ARRAY_SIZE(arg_offs_table); + return regs_get_kernel_stack_nth(regs, 1 + n); + } +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. -- cgit v1.2.2 From 8d7d14fb27818eb08ebedf9f4a6e286970fe9977 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 21 Aug 2009 15:43:07 -0400 Subject: x86: Fix x86 instruction decoder selftest to check only .text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the x86 instruction decoder selftest to check only .text, because other sections (e.g. .notes) will have random bytes which don't need to be checked. Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: H. Peter Anvin Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090821194307.12478.76938.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 3dd626b99dc8..95e9cc4bcd94 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -1,6 +1,6 @@ PHONY += posttest quiet_cmd_posttest = TEST $@ - cmd_posttest = $(OBJDUMP) -d $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len + cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len posttest: $(obj)/test_get_len vmlinux $(call cmd,posttest) -- cgit v1.2.2 From 69d991f32152283cbc373136fa45bbb152b32048 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 21 Aug 2009 15:43:16 -0400 Subject: x86: Check awk features before generating inat-tables.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check for some mandatory awk features needed to generate inat-tables.c, which old mawk doesn't support. Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: H. Peter Anvin Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090821194316.12478.57394.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/tools/gen-insn-attr-x86.awk | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 93b62c92d044..19ba096b7dd1 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -4,7 +4,25 @@ # # Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c +# Awk implementation sanity check +function check_awk_implement() { + if (!match("abc", "[[:lower:]]+")) + return "Your awk doesn't support character-class."
+ if (sprintf("%x", 0) != "0") + return "Your awk has a printf-format problem." + return "" +} + BEGIN { + # Implementation error checking + awkchecked = check_awk_implement() + if (awkchecked != "") { + print "Error: " awkchecked > "/dev/stderr" + print "Please try to use gawk." > "/dev/stderr" + exit 1 + } + + # Setup generating tables print "/* x86 opcode map generated from x86-opcode-map.txt */" print "/* Do not change this code. */" ggid = 1 @@ -293,6 +311,8 @@ function convert_operands(opnd, i,imm,mod) } END { + if (awkchecked != "") + exit 1 # print escape opcode map's array print "/* Escape opcode map array */" print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ -- cgit v1.2.2 From 24851d2447830e6cba4c4b641cb73e713f312373 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 26 Aug 2009 23:38:30 +0200 Subject: tracing/kprobes: Dump the culprit kprobe in case of kprobe recursion Kprobes can enter a probing recursion, i.e. a kprobe that loops endlessly because one of the core functions used during probing is itself probed. This patch helps pinpoint the kprobe that raised such a recursion by dumping it and raising a BUG instead of a warning (we also disarm the kprobe to try to avoid recursion inside BUG itself). Having a BUG instead of a warning stops the stacktrace in the right place and doesn't pollute the logs with hundreds of traces that eventually end up in a stack overflow. Signed-off-by: Frederic Weisbecker Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli --- arch/x86/kernel/kprobes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 16ae9610f6ff..ecee3d23fef8 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -490,9 +490,13 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, /* A probe has been hit in the codepath leading up * to, or just after, single-stepping of a probed * instruction. This entire codepath should strictly - * reside in .kprobes.text section. Raise a warning - * to highlight this peculiar case. + * reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless + * reentering loop and eventually a stack overflow.
*/ + arch_disarm_kprobe(p); + dump_kprobe(p); + BUG(); + } + default: /* impossible cases */ -- cgit v1.2.2 From c8bc6f3c806f1fcbfdbf0b1ff6c52dba59192d3b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 4 Aug 2009 12:07:09 -0700 Subject: x86: arch specific support for remapping HPET MSIs Add x86 arch support for remapping HPET MSIs by associating the HPET timer block with the interrupt-remapping HW unit and setting up the appropriate irq_chip. Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: David Woodhouse Cc: Jesse Barnes Cc: Jay Fenlason LKML-Reference: <20090804190729.630510000@intel.com> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hpet.h | 5 +++-- arch/x86/kernel/acpi/boot.c | 1 + arch/x86/kernel/apic/io_apic.c | 49 +++++++++++++++++++++++++++++++++++------- arch/x86/kernel/hpet.c | 3 ++- 4 files changed, 47 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 65847c578b70..5d89fd2a3690 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -65,6 +65,7 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; +extern u8 hpet_blockid; extern int hpet_force_user; extern int is_hpet_enabled(void); extern int hpet_enable(void); @@ -78,9 +79,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); #ifdef CONFIG_PCI_MSI -extern int arch_setup_hpet_msi(unsigned int irq); +extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); #else -static inline int arch_setup_hpet_msi(unsigned int irq) +static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { return -EINVAL; } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6b8ca3a0285d..eae642b0f345 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) } hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; /* * Some broken BIOSes advertise HPET at 0x0.
We really do not diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5ddc80..d9c6f14d3b32 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3254,7 +3254,8 @@ void destroy_irq(unsigned int irq) * MSI message composition */ #ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) { struct irq_cfg *cfg; int err; @@ -3288,7 +3289,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.dest_id = IRTE_DEST(dest); /* Set source-id of interrupt request */ - set_msi_sid(&irte, pdev); + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); modify_irte(irq, &irte); @@ -3453,7 +3457,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(dev, irq, &msg); + ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; @@ -3586,7 +3590,7 @@ int arch_setup_dmar_msi(unsigned int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); + ret = msi_compose_msg(NULL, irq, &msg, -1); if (ret < 0) return ret; dmar_msi_write(irq, &msg); @@ -3626,6 +3630,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ +static struct irq_chip ir_hpet_msi_type = { + .name = "IR-HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif +#endif + .retrigger = ioapic_retrigger_irq, +}; + static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, @@ -3637,20 +3654,36 @@ static struct irq_chip hpet_msi_type = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_hpet_msi(unsigned int irq) +int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { int ret; struct msi_msg msg; struct irq_desc *desc = irq_to_desc(irq); - ret = msi_compose_msg(NULL, irq, &msg); + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + } + + ret = msi_compose_msg(NULL, irq, &msg, id); if (ret < 0) return ret; hpet_msi_write(irq, &msg); desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, - "edge"); + if (irq_remapped(irq)) + set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, + handle_edge_irq, "edge"); + else + set_irq_chip_and_handler_name(irq, &hpet_msi_type, + handle_edge_irq, "edge"); return 0; } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba575f0f2e34..7f024ff47d1d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,6 +33,7 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -467,7 +468,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { - if (arch_setup_hpet_msi(irq)) { + if (arch_setup_hpet_msi(irq, hpet_blockid)) { destroy_irq(irq); return -EINVAL; } -- cgit v1.2.2 From e9afe9e1b3fdbd56cca53959a2519e70db9c8095 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:22:58 -0400 
Subject: kprobes/x86: Call BUG() when reentering probe into KPROBES_HIT_SS Call BUG() when a probe has been hit in the middle of the kprobe processing path, because such probes are currently unrecoverable (recovering them would cause an infinite loop and a stack overflow). The original code seems to assume that the reentry is caused by an int3 which another subsystem inserted in the out-of-line single-step buffer if the hitting probe is the same as the current probe. However, in that case, the int3-hitting address is inside the out-of-line buffer and should differ from the first (current) int3 address. Thus, I decided to remove the code. I also removed arch_disarm_kprobe() because it would involve other machinery in text_poke(). Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172258.8246.61889.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index ecee3d23fef8..e0fb615ba1e9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -482,22 +482,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; break; case KPROBE_HIT_SS: - if (p == kprobe_running()) { - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= kcb->kprobe_saved_flags; - return 0; - } else { - /* A probe has been hit in the codepath leading up - * to, or just after, single-stepping of a probed - * instruction. This entire codepath should strictly - * reside in .kprobes.text section. - * Raise a BUG or we'll continue in an endless - * reentering loop and eventually a stack overflow. - */ - arch_disarm_kprobe(p); - dump_kprobe(p); - BUG(); - } + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); default: /* impossible cases */ WARN_ON(1); -- cgit v1.2.2 From f5ad31158d60946b9fd18c8a79c283a6bc432430 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:23:04 -0400 Subject: kprobes/x86-64: Allow reentering a probe on post_handler Allow reentering a probe on the post_handler of another probe on x86-64, because x86-64 already allows reentering int3. In that case, the reentered probe just increments kp.nmissed and returns. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172304.8246.4822.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e0fb615ba1e9..c5f1f117e0c0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -463,17 +463,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: -#ifdef CONFIG_X86_64 - /* TODO: Provide re-entrancy from post_kprobes_handler() and - * avoid exception stack corruption while single-stepping on - * the instruction of the new probe.
- */ - arch_disarm_kprobe(p); - regs->ip = (unsigned long)p->addr; - reset_current_kprobe(); - preempt_enable_no_resched(); - break; -#endif case KPROBE_HIT_ACTIVE: save_previous_kprobe(kcb); set_current_kprobe(p, regs, kcb); -- cgit v1.2.2 From 62c9295f9dd250ea1bb2c8078642a275a9ce82f8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:23:11 -0400 Subject: kprobes/x86: Fix to add __kprobes to in-kernel fault handing functions Add __kprobes to the functions which handle in-kernel fixable page faults. Since kprobes can cause those in-kernel page faults by accessing kprobe data structures, probing those fault functions will cause fault-int3-loop (do_page_fault has already been marked as __kprobes). Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172311.8246.92725.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/mm/fault.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bfae139182ff..c322e59f2d10 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -38,7 +38,8 @@ enum x86_pf_error_code { * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ -static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) +static inline int __kprobes +kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) @@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) return 0; } -static inline int notify_page_fault(struct pt_regs *regs) +static inline int __kprobes notify_page_fault(struct pt_regs *regs) { int ret = 0; @@ -239,7 +240,7 @@ void vmalloc_sync_all(void) * * Handle a fault on the vmalloc or module mapping area */ -static noinline int vmalloc_fault(unsigned long address) +static noinline __kprobes int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; @@ -361,7 +362,7 @@ void vmalloc_sync_all(void) * * This assumes no large pages in there. */ -static noinline int vmalloc_fault(unsigned long address) +static noinline __kprobes int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; pud_t *pud, *pud_ref; @@ -858,7 +859,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int +static noinline __kprobes int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; -- cgit v1.2.2 From 8222d718b3ad3ae49c48f69ae4b6a1128c9a92cf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:23:25 -0400 Subject: kprobes/x86-64: Fix to move common_interrupt to .kprobes.text Since nmi, debug and int3 returns to irq_return inside common_interrupt, probing this function will cause int3-loop, so it should be marked as __kprobes. 
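The recurring annotation in this series can be reduced to a hedged sketch (the helper below is hypothetical, not taken from any of these patches):

    #include <linux/kprobes.h>

    /* __kprobes places the function in .kprobes.text, which the kprobe
     * core refuses to probe, so the int3 recursion described above
     * cannot start from this code. */
    static int __kprobes my_fault_path_helper(unsigned long addr)
    {
            return addr != 0;   /* placeholder body for illustration */
    }
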
Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172325.8246.40000.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/entry_64.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be745107..36e2ef5cc83f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -809,6 +809,10 @@ END(interrupt) call \func .endm +/* + * Interrupt entry/exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -947,6 +951,10 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) +/* + * End of kprobes section + */ + .popsection /* * APIC interrupts. -- cgit v1.2.2 From 50a482fbd96943516b7a2783900e8fe61a6425e7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Aug 2009 18:13:19 -0400 Subject: x86: Allow x86-32 instruction decoder selftest on x86-64 Pass $(CONFIG_64BIT) to the x86 insn decoder selftest in case we are decoding 32bit code on x86-64, which will happen when building kernel with ARCH=i386 on x86-64. Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: Ingo Molnar LKML-Reference: <20090828221319.8778.88508.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/tools/Makefile | 2 +- arch/x86/tools/test_get_len.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 95e9cc4bcd94..1bd006c81564 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -1,6 +1,6 @@ PHONY += posttest quiet_cmd_posttest = TEST $@ - cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len + cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len $(CONFIG_64BIT) posttest: $(obj)/test_get_len vmlinux $(call cmd,posttest) diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 1e81adb2d8a9..a3273f4244d5 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -45,7 +45,7 @@ const char *prog; static void usage(void) { fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" - " ./test_get_len\n"); + " %s [y|n](64bit flag)\n", prog); exit(1); } @@ -63,11 +63,15 @@ int main(int argc, char **argv) unsigned char insn_buf[16]; struct insn insn; int insns = 0; + int x86_64 = 0; prog = argv[0]; - if (argc > 1) + if (argc > 2) usage(); + if (argc == 2 && argv[1][0] == 'y') + x86_64 = 1; + while (fgets(line, BUFSIZE, stdin)) { char copy[BUFSIZE], *s, *tab1, *tab2; int nb = 0; @@ -93,11 +97,7 @@ int main(int argc, char **argv) break; } /* Decode an instruction */ -#ifdef __x86_64__ - insn_init(&insn, insn_buf, 1); -#else - insn_init(&insn, insn_buf, 0); -#endif + insn_init(&insn, insn_buf, x86_64); insn_get_length(&insn); if (insn.length != nb) { fprintf(stderr, "Error: %s", line); -- cgit v1.2.2 From 70069577323e6f72b845166724f34b9858134437 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Aug 2009 18:13:26 -0400 Subject: x86: Remove unused config macros from instruction decoder selftest Remove dummy definitions of CONFIG_X86_64 and CONFIG_X86_32 because those macros are not used in the instruction decoder anymore. 
Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: Ingo Molnar LKML-Reference: <20090828221326.8778.70723.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/tools/test_get_len.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index a3273f4244d5..376d33852191 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -21,11 +21,6 @@ #include #include -#ifdef __x86_64__ -#define CONFIG_X86_64 -#else -#define CONFIG_X86_32 -#endif #define unlikely(cond) (cond) #include -- cgit v1.2.2 From f12b4f546b4e327d5620a544a2bddab68de66027 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Sep 2009 12:32:46 -0400 Subject: x86: Add MMX support for instruction decoder Add MMX/SSE instructions to x86 opcode maps, since some of those instructions are used in the kernel. This also fixes failures in the x86 instruction decoder seftest. Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: H. Peter Anvin Cc: Sam Ravnborg Cc: Frederic Weisbecker Cc: Ingo Molnar LKML-Reference: <20090908163246.23516.78835.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/lib/x86-opcode-map.txt | 307 ++++++++++++++++++++++++++-------------- 1 file changed, 200 insertions(+), 107 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 083dd59dd74b..59e20d5c2a52 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -310,14 +310,14 @@ Referrer: 2-byte escape 0e: 0f: # 0x0f 0x10-0x1f -10: -11: -12: -13: -14: -15: -16: -17: +10: movups Vps,Wps | movss Vss,Wss (F3) | movupd Vpd,Wpd (66) | movsd Vsd,Wsd (F2) +11: movups Wps,Vps | movss Wss,Vss (F3) | movupd Wpd,Vpd (66) | movsd Wsd,Vsd (F2) +12: movlps Vq,Mq | movlpd Vq,Mq (66) | movhlps Vq,Uq | movddup Vq,Wq (F2) | movsldup Vq,Wq (F3) +13: mpvlps Mq,Vq | movlpd Mq,Vq (66) +14: unpcklps Vps,Wq | unpcklpd Vpd,Wq (66) +15: unpckhps Vps,Wq | unpckhpd Vpd,Wq (66) +16: movhps Vq,Mq | movhpd Vq,Mq (66) | movlsps Vq,Uq | movshdup Vq,Wq (F3) +17: movhps Mq,Vq | movhpd Mq,Vq (66) 18: Grp16 (1A) 19: 1a: @@ -337,12 +337,12 @@ Referrer: 2-byte escape 27: 28: movaps Vps,Wps | movapd Vpd,Wpd (66) 29: movaps Wps,Vps | movapd Wpd,Vpd (66) -2a: -2b: -2c: -2d: -2e: -2f: +2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2) +2b: movntps Mps,Vps | movntpd Mpd,Vpd (66) +2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2) +2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2) +2e: ucomiss Vss,Wss | ucomisd Vsd,Wsd (66) +2f: comiss Vss,Wss | comisd Vsd,Wsd (66) # 0x0f 0x30-0x3f 30: WRMSR 31: RDTSC @@ -378,56 +378,56 @@ Referrer: 2-byte escape 4e: CMOVLE/NG Gv,Ev 4f: CMOVNLE/G Gv,Ev # 0x0f 0x50-0x5f -50: -51: -52: -53: -54: -55: -56: -57: -58: -59: -5a: -5b: -5c: -5d: -5e: -5f: +50: movmskps Gd/q,Ups | movmskpd Gd/q,Upd (66) +51: sqrtps Vps,Wps | sqrtss Vss,Wss (F3) | sqrtpd Vpd,Wpd (66) | sqrtsd Vsd,Wsd (F2) +52: rsqrtps Vps,Wps | rsqrtss Vss,Wss (F3) +53: rcpps Vps,Wps | rcpss Vss,Wss (F3) +54: andps Vps,Wps | andpd Vpd,Wpd (66) +55: andnps Vps,Wps | andnpd Vpd,Wpd (66) +56: orps Vps,Wps | orpd Vpd,Wpd (66) +57: xorps Vps,Wps | xorpd Vpd,Wpd (66) +58: addps Vps,Wps | addss Vss,Wss (F3) | addpd Vpd,Wpd (66) | addsd Vsd,Wsd (F2) +59: mulps Vps,Wps | mulss Vss,Wss (F3) | mulpd Vpd,Wpd (66) | mulsd 
Vsd,Wsd (F2) +5a: cvtps2pd Vpd,Wps | cvtss2sd Vsd,Wss (F3) | cvtpd2ps Vps,Wpd (66) | cvtsd2ss Vsd,Wsd (F2) +5b: cvtdq2ps Vps,Wdq | cvtps2dq Vdq,Wps (66) | cvttps2dq Vdq,Wps (F3) +5c: subps Vps,Wps | subss Vss,Wss (F3) | subpd Vpd,Wpd (66) | subsd Vsd,Wsd (F2) +5d: minps Vps,Wps | minss Vss,Wss (F3) | minpd Vpd,Wpd (66) | minsd Vsd,Wsd (F2) +5e: divps Vps,Wps | divss Vss,Wss (F3) | divpd Vpd,Wpd (66) | divsd Vsd,Wsd (F2) +5f: maxps Vps,Wps | maxss Vss,Wss (F3) | maxpd Vpd,Wpd (66) | maxsd Vsd,Wsd (F2) # 0x0f 0x60-0x6f -60: -61: -62: -63: -64: -65: -66: -67: -68: -69: -6a: -6b: -6c: -6d: -6e: -6f: +60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66) +61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66) +62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66) +63: packsswb Pq,Qq | packsswb Vdq,Wdq (66) +64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66) +65: pcmpgtw Pq,Qq | pcmpgtw(66) Vdq,Wdq +66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66) +67: packuswb Pq,Qq | packuswb(66) Vdq,Wdq +68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66) +69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66) +6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66) +6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66) +6c: punpcklqdq Vdq,Wdq (66) +6d: punpckhqdq Vdq,Wdq (66) +6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66) +6f: movq Pq,Qq | movdqa Vdq,Wdq (66) | movdqu Vdq,Wdq (F3) # 0x0f 0x70-0x7f -70: +70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66) | pshufhw Vdq,Wdq,Ib (F3) | pshuflw VdqWdq,Ib (F2) 71: Grp12 (1A) 72: Grp13 (1A) 73: Grp14 (1A) -74: -75: -76: -77: +74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66) +75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66) +76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66) +77: emms 78: VMREAD Ed/q,Gd/q 79: VMWRITE Gd/q,Ed/q 7a: 7b: -7c: -7d: -7e: -7f: +7c: haddps(F2) Vps,Wps | haddpd(66) Vpd,Wpd +7d: hsubps(F2) Vps,Wps | hsubpd(66) Vpd,Wpd +7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66) | movq Vq,Wq (F3) +7f: movq Qq,Pq | movdqa Wdq,Vdq (66) | movdqu Wdq,Vdq (F3) # 0x0f 0x80-0x8f 80: JO Jz (f64) 81: JNO Jz (f64) @@ -499,11 +499,11 @@ bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf c0: XADD Eb,Gb c1: XADD Ev,Gv -c2: +c2: cmpps Vps,Wps,Ib | cmpss Vss,Wss,Ib (F3) | cmppd Vpd,Wpd,Ib (66) | cmpsd Vsd,Wsd,Ib (F2) c3: movnti Md/q,Gd/q -c4: -c5: -c6: +c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66) +c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66) +c6: shufps Vps,Wps,Ib | shufpd Vpd,Wpd,Ib (66) c7: Grp9 (1A) c8: BSWAP RAX/EAX/R8/R8D c9: BSWAP RCX/ECX/R9/R9D @@ -514,60 +514,131 @@ cd: BSWAP RBP/EBP/R13/R13D ce: BSWAP RSI/ESI/R14/R14D cf: BSWAP RDI/EDI/R15/R15D # 0x0f 0xd0-0xdf -d0: -d1: -d2: -d3: -d4: -d5: -d6: -d7: -d8: -d9: -da: -db: -dc: -dd: -de: -df: +d0: addsubps Vps,Wps (F2) | addsubpd Vpd,Wpd (66) +d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66) +d2: psrld Pq,Qq | psrld Vdq,Wdq (66) +d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66) +d4: paddq Pq,Qq | paddq Vdq,Wdq (66) +d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66) +d6: movq Wq,Vq (66) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66) +d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66) +d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66) +da: pminub Pq,Qq | pminub Vdq,Wdq (66) +db: pand Pq,Qq | pand Vdq,Wdq (66) +dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66) +dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66) +de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66) +df: pandn Pq,Qq | pandn Vdq,Wdq (66) # 0x0f 0xe0-0xef -e0: -e1: -e2: -e3: -e4: -e5: -e6: -e7: -e8: -e9: -ea: -eb: -ec: -ed: -ee: -ef: +e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66) +e1: psraw Pq,Qq | psraw Vdq,Wdq (66) +e2: psrad Pq,Qq | psrad Vdq,Wdq (66) +e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66) +e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66) +e5: 
pmulhw Pq,Qq | pmulhw Vdq,Wdq (66) +e6: cvtpd2dq Vdq,Wpd (F2) | cvttpd2dq Vdq,Wpd (66) | cvtdq2pd Vpd,Wdq (F3) +e7: movntq Mq,Pq | movntdq Mdq,Vdq (66) +e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66) +e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66) +ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66) +eb: por Pq,Qq | por Vdq,Wdq (66) +ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66) +ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66) +ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66) +ef: pxor Pq,Qq | pxor Vdq,Wdq (66) # 0x0f 0xf0-0xff -f0: -f1: -f2: -f3: -f4: -f5: -f6: -f7: -f8: -f9: -fa: -fb: -fc: -fd: -fe: +f0: lddqu Vdq,Mdq (F2) +f1: psllw Pq,Qq | psllw Vdq,Wdq (66) +f2: pslld Pq,Qq | pslld Vdq,Wdq (66) +f3: psllq Pq,Qq | psllq Vdq,Wdq (66) +f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66) +f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66) +f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66) +f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66) +f8: psubb Pq,Qq | psubb Vdq,Wdq (66) +f9: psubw Pq,Qq | psubw Vdq,Wdq (66) +fa: psubd Pq,Qq | psubd Vdq,Wdq (66) +fb: psubq Pq,Qq | psubq Vdq,Wdq (66) +fc: paddb Pq,Qq | paddb Vdq,Wdq (66) +fd: paddw Pq,Qq | paddw Vdq,Wdq (66) +fe: paddd Pq,Qq | paddd Vdq,Wdq (66) ff: EndTable Table: 3-byte opcode 1 Referrer: 3-byte escape 1 +# 0x0f 0x38 0x00-0x0f +00: pshufb Pq,Qq | pshufb Vdq,Wdq (66) +01: phaddw Pq,Qq | phaddw Vdq,Wdq (66) +02: phaddd Pq,Qq | phaddd Vdq,Wdq (66) +03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66) +04: pmaddubsw Pq,Qq | pmaddubsw (66)Vdq,Wdq +05: phsubw Pq,Qq | phsubw Vdq,Wdq (66) +06: phsubd Pq,Qq | phsubd Vdq,Wdq (66) +07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66) +08: psignb Pq,Qq | psignb Vdq,Wdq (66) +09: psignw Pq,Qq | psignw Vdq,Wdq (66) +0a: psignd Pq,Qq | psignd Vdq,Wdq (66) +0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66) +0c: +0d: +0e: +0f: +# 0x0f 0x38 0x10-0x1f +10: pblendvb Vdq,Wdq (66) +11: +12: +13: +14: blendvps Vdq,Wdq (66) +15: blendvpd Vdq,Wdq (66) +16: +17: ptest Vdq,Wdq (66) +18: +19: +1a: +1b: +1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66) +1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66) +1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66) +1f: +# 0x0f 0x38 0x20-0x2f +20: pmovsxbw Vdq,Udq/Mq (66) +21: pmovsxbd Vdq,Udq/Md (66) +22: pmovsxbq Vdq,Udq/Mw (66) +23: pmovsxwd Vdq,Udq/Mq (66) +24: pmovsxwq Vdq,Udq/Md (66) +25: pmovsxdq Vdq,Udq/Mq (66) +26: +27: +28: pmuldq Vdq,Wdq (66) +29: pcmpeqq Vdq,Wdq (66) +2a: movntdqa Vdq,Mdq (66) +2b: packusdw Vdq,Wdq (66) +2c: +2d: +2e: +2f: +# 0x0f 0x38 0x30-0x3f +30: pmovzxbw Vdq,Udq/Mq (66) +31: pmovzxbd Vdq,Udq/Md (66) +32: pmovzxbq Vdq,Udq/Mw (66) +33: pmovzxwd Vdq,Udq/Mq (66) +34: pmovzxwq Vdq,Udq/Md (66) +35: pmovzxdq Vdq,Udq/Mq (66) +36: +37: pcmpgtq Vdq,Wdq (66) +38: pminsb Vdq,Wdq (66) +39: pminsd Vdq,Wdq (66) +3a: pminuw Vdq,Wdq (66) +3b: pminud Vdq,Wdq (66) +3c: pmaxsb Vdq,Wdq (66) +3d: pmaxsd Vdq,Wdq (66) +3e: pmaxuw Vdq,Wdq (66) +3f: pmaxud Vdq,Wdq (66) +# 0x0f 0x38 0x4f-0xff +40: pmulld Vdq,Wdq (66) +41: phminposuw Vdq,Wdq (66) 80: INVEPT Gd/q,Mdq (66) 81: INVPID Gd/q,Mdq (66) f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) @@ -576,7 +647,29 @@ EndTable Table: 3-byte opcode 2 Referrer: 3-byte escape 2 -# all opcode is for SSE +# 0x0f 0x3a 0x00-0xff +08: roundps Vdq,Wdq,Ib (66) +09: roundpd Vdq,Wdq,Ib (66) +0a: roundss Vss,Wss,Ib (66) +0b: roundsd Vsd,Wsd,Ib (66) +0c: blendps Vdq,Wdq,Ib (66) +0d: blendpd Vdq,Wdq,Ib (66) +0e: pblendw Vdq,Wdq,Ib (66) +0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66) +14: pextrb Rd/Mb,Vdq,Ib (66) +15: pextrw Rd/Mw,Vdq,Ib (66) +16: pextrd/pextrq Ed/q,Vdq,Ib (66) +17: extractps Ed,Vdq,Ib (66) +20: pinsrb Vdq,Rd/q/Mb,Ib (66) +21: insertps Vdq,Udq/Md,Ib (66) +22: pinsrd/pinsrq Vdq,Ed/q,Ib (66) 
+40: dpps Vdq,Wdq,Ib (66) +41: dppd Vdq,Wdq,Ib (66) +42: mpsadbw Vdq,Wdq,Ib (66) +60: pcmpestrm Vdq,Wdq,Ib (66) +61: pcmpestri Vdq,Wdq,Ib (66) +62: pcmpistrm Vdq,Wdq,Ib (66) +63: pcmpistri Vdq,Wdq,Ib (66) EndTable GrpTable: Grp1 -- cgit v1.2.2 From a00e817f42663941ea0aa5f85a9d1c4f8b212839 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Sep 2009 12:47:55 -0400 Subject: kprobes/x86-32: Move irq-exit functions to kprobes section Move irq-exit functions to .kprobes.text section to protect against kprobes recursion. When I ran kprobe stress test on x86-32, I found below symbols cause unrecoverable recursive probing: ret_from_exception ret_from_intr check_userspace restore_all restore_all_notrace restore_nocheck irq_return And also, I found some interrupt/exception entry points that cause similar problems. This patch moves those symbols (including their container functions) to .kprobes.text section to prevent any kprobes probing. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ingo Molnar LKML-Reference: <20090908164755.24050.81182.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/entry_32.S | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c6..beb30da203d6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -333,6 +333,10 @@ ENTRY(ret_from_fork) CFI_ENDPROC END(ret_from_fork) +/* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to @@ -383,6 +387,10 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -513,6 +521,10 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -705,6 +717,10 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* * System calls that need a pt_regs pointer. @@ -814,6 +830,10 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) +/* + * End of kprobes section + */ + .popsection ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder -- cgit v1.2.2 From ad5cafcdb09c57008c990edd309c0a563b09f238 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2009 19:53:06 -0400 Subject: x86/ptrace: Fix regs_get_argument_nth() to add correct offset Fix regs_get_argument_nth() to add correct offset bytes. Because offset_of() returns offset in byte, the offset should be added to char * instead of unsigned long *. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <20090910235306.22412.31613.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a33a17d5d5c8..caffb6809452 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -150,7 +150,7 @@ static const int arg_offs_table[] = { unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) { if (n < ARRAY_SIZE(arg_offs_table)) - return *((unsigned long *)regs + arg_offs_table[n]); + return *(unsigned long *)((char *)regs + arg_offs_table[n]); else { /* * The typical case: arg n is on the stack. -- cgit v1.2.2 From b8a4754147d61f5359a765a3afd3eb03012aa052 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 30 Jul 2009 11:10:02 +0200 Subject: x86, msr: Unify rdmsr_on_cpus/wrmsr_on_cpus Since rdmsr_on_cpus and wrmsr_on_cpus are almost identical, unify them into a common __rwmsr_on_cpus helper thus avoiding code duplication. While at it, convert cpumask_t's to const struct cpumask *. Signed-off-by: Borislav Petkov Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 4 ++-- arch/x86/lib/msr.c | 46 +++++++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 7e2b6ba962ff..9a00219b331a 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -247,8 +247,8 @@ do { \ #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); -void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); -void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 33a1e3ca22d8..41628b104b9e 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -71,14 +71,9 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) } EXPORT_SYMBOL(wrmsr_on_cpu); -/* rdmsr on a bunch of CPUs - * - * @mask: which CPUs - * @msr_no: which MSR - * @msrs: array of MSR values - * - */ -void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) +static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, + struct msr *msrs, + void (*msr_func) (void *info)) { struct msr_info rv; int this_cpu; @@ -92,11 +87,23 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) this_cpu = get_cpu(); if (cpumask_test_cpu(this_cpu, mask)) - __rdmsr_on_cpu(&rv); + msr_func(&rv); - smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); + smp_call_function_many(mask, msr_func, &rv, 1); put_cpu(); } + +/* rdmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct 
msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); +} EXPORT_SYMBOL(rdmsr_on_cpus); /* @@ -107,24 +114,9 @@ EXPORT_SYMBOL(rdmsr_on_cpus); * @msrs: array of MSR values * */ -void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) { - struct msr_info rv; - int this_cpu; - - memset(&rv, 0, sizeof(rv)); - - rv.off = cpumask_first(mask); - rv.msrs = msrs; - rv.msr_no = msr_no; - - this_cpu = get_cpu(); - - if (cpumask_test_cpu(this_cpu, mask)) - __wrmsr_on_cpu(&rv); - - smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); - put_cpu(); + __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); } EXPORT_SYMBOL(wrmsr_on_cpus); -- cgit v1.2.2 From 9f0cf4adb6aa0bfccf675c938124e68f7f06349d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 26 Sep 2009 14:33:01 +0200 Subject: x86: Use __builtin_object_size() to validate the buffer size for copy_from_user() gcc (4.x) supports the __builtin_object_size() builtin, which reports the size of an object that a pointer points to, when known at compile time. If the buffer size is not known at compile time, a constant -1 is returned. This patch uses this feature to add a sanity check to copy_from_user(); if the target buffer is known to be smaller than the copy size, the copy is aborted and a WARNing is emitted in memory debug mode. These extra checks compile away when the object size is not known, or if both the buffer size and the copy length are constants. Signed-off-by: Arjan van de Ven LKML-Reference: <20090926143301.2c396b94@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_32.h | 19 ++++++++++++++++++- arch/x86/include/asm/uaccess_64.h | 19 ++++++++++++++++++- arch/x86/kernel/x8664_ksyms_64.c | 2 +- arch/x86/lib/copy_user_64.S | 4 ++-- arch/x86/lib/usercopy_32.c | 4 ++-- 5 files changed, 41 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 632fb44b4cb5..582d6aef7417 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -187,9 +187,26 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n); -unsigned long __must_check copy_from_user(void *to, +unsigned long __must_check _copy_from_user(void *to, const void __user *from, unsigned long n); + +static inline unsigned long __must_check copy_from_user(void *to, + const void __user *from, + unsigned long n) +{ + int sz = __compiletime_object_size(to); + int ret = -EFAULT; + + if (likely(sz == -1 || sz >= n)) + ret = _copy_from_user(to, from, n); +#ifdef CONFIG_DEBUG_VM + else + WARN(1, "Buffer overflow detected!\n"); +#endif + return ret; +} + long __must_check strncpy_from_user(char *dst, const char __user *src, long count); long __must_check __strncpy_from_user(char *dst, diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index db24b215fc50..ce6fec7ce38d 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -21,10 +21,27 @@ copy_user_generic(void *to, const void *from, unsigned len); __must_check unsigned long copy_to_user(void __user *to, const void *from, unsigned len); __must_check unsigned long -copy_from_user(void *to, const void __user *from, unsigned len); +_copy_from_user(void *to, const void __user *from, unsigned len); __must_check unsigned long copy_in_user(void
__user *to, const void __user *from, unsigned len); +static inline unsigned long __must_check copy_from_user(void *to, + const void __user *from, + unsigned long n) +{ + int sz = __compiletime_object_size(to); + int ret = -EFAULT; + + if (likely(sz == -1 || sz >= n)) + ret = _copy_from_user(to, from, n); +#ifdef CONFIG_DEBUG_VM + else + WARN(1, "Buffer overflow detected!\n"); +#endif + return ret; +} + + static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 3909e3ba5ce3..a0cdd8cc1d67 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -30,7 +30,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); -EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(copy_to_user); EXPORT_SYMBOL(__copy_from_user_inatomic); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 6ba0f7bb85ea..4be3c415b3e9 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -78,7 +78,7 @@ ENTRY(copy_to_user) ENDPROC(copy_to_user) /* Standard copy_from_user with segment limit checking */ -ENTRY(copy_from_user) +ENTRY(_copy_from_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx @@ -88,7 +88,7 @@ ENTRY(copy_from_user) jae bad_from_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC -ENDPROC(copy_from_user) +ENDPROC(_copy_from_user) ENTRY(copy_user_generic) CFI_STARTPROC diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 1f118d462acc..8498684e45b0 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user); * data to the requested size using zero bytes. */ unsigned long -copy_from_user(void *to, const void __user *from, unsigned long n) +_copy_from_user(void *to, const void __user *from, unsigned long n) { if (access_ok(VERIFY_READ, from, n)) n = __copy_from_user(to, from, n); @@ -882,4 +882,4 @@ copy_from_user(void *to, const void __user *from, unsigned long n) memset(to, 0, n); return n; } -EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(_copy_from_user); -- cgit v1.2.2 From ff60fab71bb3b4fdbf8caf57ff3739ffd0887396 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 28 Sep 2009 14:21:22 +0200 Subject: x86: Use __builtin_memset and __builtin_memcpy for memset/memcpy GCC provides reasonable memset/memcpy functions itself, with __builtin_memset and __builtin_memcpy. For the "unknown" cases, it'll fall back to our current existing functions, but for fixed size versions it'll inline something smart. Quite often that will be the same as we have now, but sometimes it can do something smarter (for example, if the code then sets the first member of a struct, it can do a shorter memset). In addition, and this is more important, gcc knows which registers and such are not clobbered (while for our asm version it pretty much acts like a compiler barrier), so for various cases it can avoid reloading values. The effect on codesize is shown below on my typical laptop .config: text data bss dec hex filename 5605675 2041100 6525148 14171923 d83f13 vmlinux.before 5595849 2041668 6525148 14162665 d81ae9 vmlinux.after Due to some not-so-good behavior in the gcc 3.x series, this change is only done for GCC 4.x and above. 
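To make the effect concrete, here is a minimal sketch (not from the patch; the struct and function names are hypothetical) of the kind of call site this helps. With memset() mapped to __builtin_memset(), gcc expands the fixed-size clear inline and, because it also sees the store that follows, may emit a shorter clear:

#include <linux/string.h>	/* assumed kernel context: memset() is the macro above */

struct reply_hdr {		/* hypothetical example struct */
	unsigned int type;
	unsigned int len;
	unsigned char pad[8];
};

static void init_reply(struct reply_hdr *r, unsigned int type)
{
	memset(r, 0, sizeof(*r));	/* constant size: open-coded by gcc >= 4 */
	r->type = type;			/* gcc can see this store and may merge it with the clear */
}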
Signed-off-by: Arjan van de Ven LKML-Reference: <20090928142122.6fc57e9c@infradead.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/string_32.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index ae907e617181..3d3e8353ee5c 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -177,10 +177,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) */ #ifndef CONFIG_KMEMCHECK + +#if (__GNUC__ >= 4) +#define memcpy(t, f, n) __builtin_memcpy(t, f, n) +#else #define memcpy(t, f, n) \ (__builtin_constant_p((n)) \ ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) +#endif #else /* * kmemcheck becomes very happy if we use the REP instructions unconditionally, @@ -316,11 +321,15 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, : __memset_generic((s), (c), (count))) #define __HAVE_ARCH_MEMSET +#if (__GNUC__ >= 4) +#define memset(s, c, count) __builtin_memset(s, c, count) +#else #define memset(s, c, count) \ (__builtin_constant_p(c) \ ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ (count)) \ : __memset((s), (c), (count))) +#endif /* * find the first occurrence of byte 'c', or 1 past the area if none -- cgit v1.2.2 From 4a3127693001c61a21d1ce680db6340623f52e93 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 30 Sep 2009 13:05:23 +0200 Subject: x86: Turn the copy_from_user check into an (optional) compile time warning A previous patch added the buffer size check to copy_from_user(). One of the things learned from analyzing the result of the previous patch is that in general, gcc is really good at proving that the code contains sufficient security checks to not need to do a runtime check. But for the cases where gcc could not prove this, there was a relatively high percentage of real security issues. This patch turns the case of "gcc cannot prove" into a compile time warning, as long as a sufficiently new gcc is in use that supports this. The objective is that these warnings will prompt developers to check new cases before a security hole enters a Linux kernel release. Signed-off-by: Arjan van de Ven Cc: Linus Torvalds Cc: "David S.
Miller" Cc: James Morris Cc: Jan Beulich LKML-Reference: <20090930130523.348ae6c4@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_32.h | 12 +++++++++--- arch/x86/lib/usercopy_32.c | 6 ++++++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 582d6aef7417..952f9e793c3e 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -191,6 +191,13 @@ unsigned long __must_check _copy_from_user(void *to, const void __user *from, unsigned long n); + +extern void copy_from_user_overflow(void) +#ifdef CONFIG_DEBUG_STACKOVERFLOW + __compiletime_warning("copy_from_user() buffer size is not provably correct") +#endif +; + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) @@ -200,10 +207,9 @@ static inline unsigned long __must_check copy_from_user(void *to, if (likely(sz == -1 || sz >= n)) ret = _copy_from_user(to, from, n); -#ifdef CONFIG_DEBUG_VM else - WARN(1, "Buffer overflow detected!\n"); -#endif + copy_from_user_overflow(); + return ret; } diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 8498684e45b0..e218d5df85ff 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -883,3 +883,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } EXPORT_SYMBOL(_copy_from_user); + +void copy_from_user_overflow(void) +{ + WARN(1, "Buffer overflow detected!\n"); +} +EXPORT_SYMBOL(copy_from_user_overflow); -- cgit v1.2.2 From 7c68af6e32c73992bad24107311f3433c89016e2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 19 Sep 2009 09:40:22 +0300 Subject: core, x86: Add user return notifiers Add a general per-cpu notifier that is called whenever the kernel is about to return to userspace. The notifier uses a thread_info flag and existing checks, so there is no impact on user return or context switch fast paths. This will be used initially to speed up KVM task switching by lazily updating MSRs. Signed-off-by: Avi Kivity LKML-Reference: <1253342422-13811-1-git-send-email-avi@redhat.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 7 +++++-- arch/x86/kernel/process.c | 2 ++ arch/x86/kernel/signal.c | 3 +++ 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8da93745c087..1df175d15aa8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -50,6 +50,7 @@ config X86 select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA select HAVE_ARCH_KMEMCHECK + select HAVE_USER_RETURN_NOTIFIER config OUTPUT_FORMAT string diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d27d0a2fec4c..375c917c37d2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,6 +83,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ +#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -107,6 +108,7 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -142,13 +144,14 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ + _TIF_USER_RETURN_NOTIFY) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW +#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) #define PREEMPT_ACTIVE 0x10000000 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5284cd2b5776..e51b056fc88f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -224,6 +225,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + propagate_user_return_notify(prev_p, next_p); } int sys_fork(struct pt_regs *regs) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 6a44a76055ad..c49f90f7957a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (current->replacement_session_keyring) key_replace_session_keyring(); } + if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); -- cgit v1.2.2 From 63312b6a6faae3f2e5577f2b001e3b504f10a2aa Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 2 Oct 2009 07:50:50 -0700 Subject: x86: Add a Kconfig option to turn the copy_from_user warnings into errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For automated testing it is useful to have the option to turn the warnings on copy_from_user() etc checks into errors: 
In function ‘copy_from_user’, inlined from ‘fd_copyin’ at drivers/block/floppy.c:3080, inlined from ‘fd_ioctl’ at drivers/block/floppy.c:3503: linux/arch/x86/include/asm/uaccess_32.h:213: error: call to ‘copy_from_user_overflow’ declared with attribute error: copy_from_user buffer size is not provably correct Signed-off-by: Arjan van de Ven Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <20091002075050.4e9f7641@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 14 ++++++++++++++ arch/x86/include/asm/uaccess_32.h | 4 +++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d105f29bb6bb..1bd2e36f1538 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -287,4 +287,18 @@ config OPTIMIZE_INLINING If unsure, say N. +config DEBUG_STRICT_USER_COPY_CHECKS + bool "Strict copy size checks" + depends on DEBUG_KERNEL + ---help--- + Enabling this option turns a certain set of sanity checks for user + copy operations into compile time failures. + + The copy_from_user() etc checks are there to help test if there + are sufficient security checks on the length argument of + the copy operation, by having gcc prove that the argument is + within bounds. + + If unsure, or if you run an older (pre 4.4) gcc, say N. + endmenu diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 952f9e793c3e..0c9825e97f36 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -193,7 +193,9 @@ unsigned long __must_check _copy_from_user(void *to, extern void copy_from_user_overflow(void) -#ifdef CONFIG_DEBUG_STACKOVERFLOW +#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS + __compiletime_error("copy_from_user() buffer size is not provably correct") +#else __compiletime_warning("copy_from_user() buffer size is not provably correct") #endif ; -- cgit v1.2.2 From 98059e3463383b18fd79181179cd539b74846b47 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 1 Oct 2009 17:11:10 +0200 Subject: x86: AMD Geode LX optimizations Add CPU optimizations for AMD Geode LX. Signed-off-by: Matteo Croce LKML-Reference: <40101cc30910010811v5d15ff4cx9dd57c9cc9b4b045@mail.gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig.cpu | 2 +- arch/x86/Makefile_32.cpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 527519b8a9f9..979de294710d 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -406,7 +406,7 @@ config X86_CMPXCHG64 # generates cmov. 
config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) config X86_MINIMUM_CPU_FAMILY int diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 30e9a264f69d..cbf0776dbec1 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -41,7 +41,7 @@ cflags-$(CONFIG_X86_ELAN) += -march=i486 # Geode GX1 support cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx - +cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx) # add at the end to overwrite eventual tuning options from earlier # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) -- cgit v1.2.2 From c0b11d3af164947c71e2491912c5b8418900dafb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 25 Sep 2009 11:20:38 -0700 Subject: x86: Add VIA processor instructions in opcodes decoder Add VIA processor's Padlock instructions (MONTMUL, XSHA1, XSHA256), as parts of the kernel may use them. This fixes the following crash in opcodes decoder selftests: make[2]: `scripts/unifdef' is up to date. TEST posttest Error: c145cf71: f3 0f a6 d0 repz xsha256 Error: objdump says 4 bytes, but insn_get_length() says 3 (attr:0) make[1]: *** [posttest] Error 2 make: *** [bzImage] Error 2 Reported-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Acked-by: Ingo Molnar Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Tom Zanussi LKML-Reference: <20090925182037.10157.3180.stgit@omoto> Signed-off-by: Frederic Weisbecker --- arch/x86/lib/x86-opcode-map.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 59e20d5c2a52..78a0daf12e15 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -469,7 +469,7 @@ a2: CPUID a3: BT Ev,Gv a4: SHLD Ev,Gv,Ib a5: SHLD Ev,Gv,CL -a6: +a6: GrpPDLK a7: GrpRNG a8: PUSH GS (d64) a9: POP GS (d64) @@ -803,6 +803,12 @@ GrpTable: Grp16 3: prefetch T2 EndTable +GrpTable: GrpPDLK +0: MONTMUL +1: XSHA1 +2: XSHA2 +EndTable + GrpTable: GrpRNG 0: xstore-rng 1: xcrypt-ecb -- cgit v1.2.2 From 30ed1a79f5bf271d33e782afee3323582dcc621e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 3 Oct 2009 19:48:22 +0900 Subject: this_cpu: Implement X86 optimized this_cpu operations Basically the existing percpu ops can be used for this_cpu variants that also allow operations on dynamically allocated percpu data. However, we do not pass a reference to a percpu variable in. Instead, a dynamically or statically allocated percpu variable is provided. The preempt, non-preempt and irqsafe operations generate the same code. It will always be possible to have the required per-cpu atomicity in a single RMW instruction with segment override on x86. 64 bit this_cpu operations are not supported on 32 bit.
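As a usage illustration (a sketch under assumptions, not part of the patch: the names are made up and it presumes the generic this_cpu_* wrappers from the rest of the series), the point of the ops is that per-cpu data, including dynamically allocated data, can be updated without disabling preemption:

#include <linux/percpu.h>

struct pkt_stats {
	unsigned long rx;			/* per-cpu receive counter */
};

static struct pkt_stats __percpu *stats;	/* assume alloc_percpu(struct pkt_stats) succeeded */

static void count_rx(void)
{
	/* On x86 this compiles to a single segment-prefixed add, so the
	 * read-modify-write is safe against preemption without preempt_disable(). */
	this_cpu_add(stats->rx, 1);
}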
Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 78 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index b65a36defeb7..8b5ec19bdef4 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -153,6 +153,84 @@ do { \ #define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) #define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) +#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) + +#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) +#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) +#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) +#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) +#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) +#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) + +#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) +#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) +#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) + +#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define 
irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) +#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) +#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) + +/* + * Per cpu atomic 64 bit operations are only available under 64 bit. + * 32 bit must fall back to generic operations. + */ +#ifdef CONFIG_X86_64 +#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) + +#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) + +#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) + +#endif + /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ ({ \ -- cgit v1.2.2 From d6c304055b3cecd4ca865769ac7cea97a320727b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 7 Oct 2009 21:43:22 +0200 Subject: x86, msr: Remove the bkl from msr_open() Remove the big kernel lock from msr_open() as it doesn't protect anything there. The only racy event that can happen here is a concurrent cpu shutdown. So let's look at what could be racy during/after the above event: - The cpu_online() check is racy, but the bkl doesn't help with that anyway: it disables preemption, but we may be checking another cpu than the current one. Also, the cpu can still go offline between the open and read calls. - cpu_data(cpu) returns a safe pointer too. It won't be released on cpu offlining. But some fields can be changed from arch/x86/kernel/smpboot.c:remove_siblinginfo(): - phys_proc_id - cpu_core_id Those are not read from msr_open(). What we are checking is the x86_capability that is left untouched on offlining. So this removal looks safe. Signed-off-by: Frederic Weisbecker Cc: John Kacur Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sven-Thorsten Dietrich LKML-Reference: <1254944602-7382-1-git-send-email-fweisbec@gmail.com> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/msr.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7dd950094178..c00610963238 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -174,21 +174,17 @@ static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &cpu_data(cpu); - int ret = 0; - lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { - ret = -ENXIO; /* No such CPU */ - goto out; - } + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + c = &cpu_data(cpu); if (!cpu_has(c, X86_FEATURE_MSR)) - ret = -EIO; /* MSR not supported */ -out: - unlock_kernel(); - return ret; + return -EIO; /* MSR not supported */ + + return 0; } /* -- cgit v1.2.2 From 170a0bc3808909d8ea0f3f9c725c6565efe7f9c4 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Wed, 7 Oct 2009 20:19:32 +0200 Subject: x86, cpuid: Remove the bkl from cpuid_open() Most of the variables are local to the function. It IS possible that for struct cpuinfo_x86 *c c could point to the same area. However, this is used read only. Signed-off-by: John Kacur LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpuid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index b07af8861244..ef6928418c8f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -118,8 +118,6 @@ static int cpuid_open(struct inode *inode, struct file *file) struct cpuinfo_x86 *c; int ret = 0; - lock_kernel(); - cpu = iminor(file->f_path.dentry->d_inode); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { ret = -ENXIO; /* No such CPU */ @@ -129,7 +127,6 @@ static int cpuid_open(struct inode *inode, struct file *file) if (c->cpuid_level < 0) ret = -EIO; /* CPUID not supported */ out: - unlock_kernel(); return ret; } -- cgit v1.2.2 From 5a943617ef52e9f79cd7cf437aad8870be27aabb Mon Sep 17 00:00:00 2001 From: John Kacur Date: Thu, 8 Oct 2009 17:20:15 +0200 Subject: x86, cpuid: Simplify the code in cpuid_open Peter picked up my patch for tip/x86/cpu that removes the bkl in cpuid_open. Ingo subsequently merged that into tip/master. This patch folds back in tglx's 55968ede164ae523692f00717f50cd926f1382a0 to my patch that removed the bkl. This simplifies the code, and makes it consistent with the changes to kill the bkl in msr.c as well. Originally-by: Thomas Gleixner Signed-off-by: John Kacur Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpuid.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index ef6928418c8f..48e8e6558b26 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -116,18 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file) { unsigned int cpu; struct cpuinfo_x86 *c; - int ret = 0; cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { - ret = -ENXIO; /* No such CPU */ - goto out; - } + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + c = &cpu_data(cpu); if (c->cpuid_level < 0) - ret = -EIO; /* CPUID not supported */ -out: - return ret; + return -EIO; /* CPUID not supported */ + + return 0; } /* -- cgit v1.2.2 From 04a705df47d1ea27ca2b066f24b1951c51792d0d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 6 Oct 2009 16:42:08 +0200 Subject: perf_events: Check for filters on fixed counter events Intel fixed counters do not support all the filters possible with a generic counter. Thus, if a fixed counter event is passed but with certain filters set, then the fixed_mode_idx() function must fail and the event must be measured in a generic counter instead. Reject filters are: inv, edge, cnt-mask. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1254840129-6198-2-git-send-email-eranian@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 13 ++++++++++++- arch/x86/kernel/cpu/perf_event.c | 6 ++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ad7ce3fd5065..8d9f8548a870 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -28,9 +28,20 @@ */ #define ARCH_PERFMON_EVENT_MASK 0xffff +/* + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 + #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b5801c311846..1d16bd69551e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1349,6 +1349,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) if (!x86_pmu.num_events_fixed) return -1; + /* + * fixed counters do not take all possible filters + */ + if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) + return -1; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) -- cgit v1.2.2 From b690081d4d3f6a23541493f1682835c3cd5c54a1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 6 Oct 2009 16:42:09 +0200 Subject: perf_events: Add event constraints support for Intel processors On some Intel processors, not all events can be measured in all counters. 
Some events can only be measured in one particular counter, for instance. Assigning an event to the wrong counter does not crash the machine but this yields bogus counts, i.e., a silent error. This patch changes the event-to-counter assignment logic to take into account event constraints for Intel P6, Core and Nehalem processors. There are no constraints on Intel Atom. There are constraints on Intel Yonah (Core Duo) but they are not provided in this patch given that this processor is not yet supported by perf_events. As a result of the constraints, it is possible for some event groups to never actually be loaded onto the PMU if they contain two events which can only be measured on a single counter. That situation can be detected with the scaling information extracted with read(). Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1254840129-6198-3-git-send-email-eranian@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 109 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1d16bd69551e..9c758548a0e6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -77,6 +77,18 @@ struct cpu_hw_events { struct debug_store *ds; }; +struct event_constraint { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int code; +}; + +#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } +#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->idxmsk[0]; (e)++) + + /* * struct x86_pmu - generic x86 pmu */ @@ -102,6 +114,7 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); + int (*get_event_idx)(struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; @@ -110,6 +123,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +static const struct event_constraint *event_constraint; + /* * Not sure about some of these */ @@ -155,6 +170,16 @@ static u64 p6_pmu_raw_event(u64 hw_event) return hw_event & P6_EVNTSEL_MASK; } +static const struct event_constraint intel_p6_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; /* * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +195,35 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; +static const struct event_constraint intel_core_event_constraints[] = +{ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static const struct event_constraint intel_nehalem_event_constraints[] = +{ + EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ + EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -932,6 +986,8 @@ static int __hw_perf_event_init(struct perf_event *event) */ hwc->config = ARCH_PERFMON_EVENTSEL_INT; + hwc->idx = -1; + /* * Count user and OS events unless requested not to. */ @@ -1365,6 +1421,45 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) return -1; } +/* + * generic counter allocator: get next free counter + */ +static int gen_get_event_idx(struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); + return idx == x86_pmu.num_events ? 
-1 : idx; +} + +/* + * intel-specific counter allocator: check event constraints + */ +static int intel_get_event_idx(struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + const struct event_constraint *event_constraint; + int i, code; + + if (!event_constraint) + goto skip; + + code = hwc->config & 0xff; + + for_each_event_constraint(event_constraint, event_constraint) { + if (code == event_constraint->code) { + for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { + if (!test_and_set_bit(i, cpuc->used_mask)) + return i; + } + return -1; + } + } +skip: + return gen_get_event_idx(hwc); +} + /* * Find a PMC slot for the freshly enabled / scheduled in event: */ @@ -1402,11 +1497,10 @@ static int x86_pmu_enable(struct perf_event *event) } else { idx = hwc->idx; /* Try to get the previous generic event again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { + if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_events); - if (idx == x86_pmu.num_events) + idx = x86_pmu.get_event_idx(hwc); + if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); @@ -1883,6 +1977,7 @@ static struct x86_pmu p6_pmu = { */ .event_bits = 32, .event_mask = (1ULL << 32) - 1, + .get_event_idx = intel_get_event_idx, }; static struct x86_pmu intel_pmu = { @@ -1906,6 +2001,7 @@ static struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, + .get_event_idx = intel_get_event_idx, }; static struct x86_pmu amd_pmu = { @@ -1926,6 +2022,7 @@ static struct x86_pmu amd_pmu = { .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, + .get_event_idx = gen_get_event_idx, }; static int p6_pmu_init(void) @@ -1938,10 +2035,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ + event_constraint = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ + event_constraint = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -2013,12 +2112,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); + event_constraint = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + event_constraint = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: -- cgit v1.2.2 From fe9081cc9bdabb0be953a39ad977cea14e35bce5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Oct 2009 11:56:07 +0200 Subject: perf, x86: Add simple group validation Refuse to add events when the group wouldn't fit onto the PMU anymore. Naive implementation. 
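A hedged userspace sketch of what the check buys (illustrative only; the raw event codes come from the P6 constraint table in the previous patch, and the attr setup shown is the minimum assumed to matter): two events that are each constrained to counter 0 can never be scheduled together, so creating the second group member now fails up front instead of the group silently never counting:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_raw(unsigned long long config, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	attr.config = config;
	/* measure self (pid 0) on any cpu (-1), joining group_fd if given */
	return syscall(__NR_perf_event_open, &attr, 0, -1, group_fd, 0);
}

int main(void)
{
	int leader  = open_raw(0xc1, -1);	/* FLOPS: counter 0 only on P6 */
	int sibling = open_raw(0x10, leader);	/* FP_COMP_OPS_EXE: also counter 0 */

	return sibling < 0 ? 0 : 1;		/* expect the sibling to fail, e.g. ENOSPC */
}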
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <1254911461.26976.239.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 90 ++++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9c758548a0e6..9961d845719d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -114,7 +114,8 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - int (*get_event_idx)(struct hw_perf_event *hwc); + int (*get_event_idx)(struct cpu_hw_events *cpuc, + struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; @@ -523,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event) #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL #define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -1390,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) x86_pmu_enable_event(hwc, idx); } -static int -fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) +static int fixed_mode_idx(struct hw_perf_event *hwc) { unsigned int hw_event; @@ -1424,9 +1424,9 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) /* * generic counter allocator: get next free counter */ -static int gen_get_event_idx(struct hw_perf_event *hwc) +static int +gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); @@ -1436,16 +1436,16 @@ static int gen_get_event_idx(struct hw_perf_event *hwc) /* * intel-specific counter allocator: check event constraints */ -static int intel_get_event_idx(struct hw_perf_event *hwc) +static int +intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); const struct event_constraint *event_constraint; int i, code; if (!event_constraint) goto skip; - code = hwc->config & 0xff; + code = hwc->config & CORE_EVNTSEL_EVENT_MASK; for_each_event_constraint(event_constraint, event_constraint) { if (code == event_constraint->code) { @@ -1457,26 +1457,22 @@ static int intel_get_event_idx(struct hw_perf_event *hwc) } } skip: - return gen_get_event_idx(hwc); + return gen_get_event_idx(cpuc, hwc); } -/* - * Find a PMC slot for the freshly enabled / scheduled in event: - */ -static int x86_pmu_enable(struct perf_event *event) +static int +x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; int idx; - idx = fixed_mode_idx(event, hwc); + idx = fixed_mode_idx(hwc); if (idx == X86_PMC_IDX_FIXED_BTS) { /* BTS is already occupied. 
*/ if (test_and_set_bit(idx, cpuc->used_mask)) return -EAGAIN; hwc->config_base = 0; - hwc->event_base = 0; + hwc->event_base = 0; hwc->idx = idx; } else if (idx >= 0) { /* @@ -1499,17 +1495,33 @@ static int x86_pmu_enable(struct perf_event *event) /* Try to get the previous generic event again */ if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = x86_pmu.get_event_idx(hwc); + idx = x86_pmu.get_event_idx(cpuc, hwc); if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); hwc->idx = idx; } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; } + return idx; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = x86_schedule_event(cpuc, hwc); + if (idx < 0) + return idx; + perf_events_lapic_init(); x86_pmu.disable(hwc, idx); @@ -2212,11 +2224,47 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; +static int +validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event fake_event = event->hw; + + if (event->pmu != &pmu) + return 0; + + return x86_schedule_event(cpuc, &fake_event); +} + +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct cpu_hw_events fake_pmu; + + memset(&fake_pmu, 0, sizeof(fake_pmu)); + + if (!validate_event(&fake_pmu, leader)) + return -ENOSPC; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + if (!validate_event(&fake_pmu, sibling)) + return -ENOSPC; + } + + if (!validate_event(&fake_pmu, event)) + return -ENOSPC; + + return 0; +} + const struct pmu *hw_perf_event_init(struct perf_event *event) { int err; err = __hw_perf_event_init(event); + if (!err) { + if (event->group_leader != event) + err = validate_group(event); + } if (err) { if (event->destroy) event->destroy(event); -- cgit v1.2.2 From f3834b9ef68067199486740b31f691afb14dbdf5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2009 10:12:46 +0200 Subject: x86: Generate cmpxchg build failures Rework the x86 cmpxchg() implementation to generate build failures when used on improper types. Signed-off-by: Peter Zijlstra Acked-by: Linus Torvalds LKML-Reference: <1254771187.21044.22.camel@laptop> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cmpxchg_32.h | 218 ++++++++++++++--------------------- arch/x86/include/asm/cmpxchg_64.h | 234 +++++++++++++++----------------------- 2 files changed, 177 insertions(+), 275 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 82ceb788a981..5371174cf5d0 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -8,14 +8,50 @@ * you need to test for the feature in boot_cpu_data. */ -#define xchg(ptr, v) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr)))) +extern void __xchg_wrong_size(void); + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway + * Note 2: xchg has side effect, so that attribute volatile is necessary, + * but generally the primitive is invalid, *ptr is output argument. 
--ANK + */ struct __xchg_dummy { unsigned long a[100]; }; #define __xg(x) ((struct __xchg_dummy *)(x)) +#define __xchg(x, ptr, size) \ +({ \ + __typeof(*(ptr)) __x = (x); \ + switch (size) { \ + case 1: \ + asm volatile("xchgb %b0,%1" \ + : "=q" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile("xchgw %w0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile("xchgl %0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + default: \ + __xchg_wrong_size(); \ + } \ + __x; \ +}) + +#define xchg(ptr, v) \ + __xchg((v), (ptr), sizeof(*ptr)) + /* * The semantics of XCHGCMP8B are a bit strange, this is why * there is a loop and the loading of %%eax and %%edx has to @@ -71,57 +107,63 @@ static inline void __set_64bit_var(unsigned long long *ptr, (unsigned int)((value) >> 32)) \ : __set_64bit(ptr, ll_low((value)), ll_high((value)))) -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK - */ -static inline unsigned long __xchg(unsigned long x, volatile void *ptr, - int size) -{ - switch (size) { - case 1: - asm volatile("xchgb %b0,%1" - : "=q" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 2: - asm volatile("xchgw %w0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 4: - asm volatile("xchgl %0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - } - return x; -} +extern void __cmpxchg_wrong_size(void); /* * Atomic compare and exchange. Compare OLD with MEM, if identical, * store NEW in MEM. Return the initial value in MEM. Success is * indicated by comparing RETURN with OLD. 
*/ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case 1: \ + asm volatile(lock "cmpxchgb %b1,%2" \ + : "=a"(__ret) \ + : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile(lock "cmpxchgw %w1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile(lock "cmpxchgl %1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + __ret; \ +}) + +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) + +#define __sync_cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + +#define __cmpxchg_local(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "") #ifdef CONFIG_X86_CMPXCHG #define __HAVE_ARCH_CMPXCHG 1 -#define cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) -#define sync_cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) + +#define cmpxchg(ptr, old, new) \ + __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define sync_cmpxchg(ptr, old, new) \ + __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define cmpxchg_local(ptr, old, new) \ + __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) #endif #ifdef CONFIG_X86_CMPXCHG64 @@ -133,94 +175,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, (unsigned long long)(n))) #endif -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile(LOCK_PREFIX "cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - -/* - * Always use locked operations when touching memory shared with a - * hypervisor, since the system may be SMP even if the guest kernel - * isn't. 
- */ -static inline unsigned long __sync_cmpxchg(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile("lock; cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile("lock; cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile("lock; cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - -static inline unsigned long __cmpxchg_local(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile("cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile("cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile("cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, unsigned long long new) diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 52de72e0de8c..485ae415faec 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -3,9 +3,6 @@ #include /* Provides LOCK_PREFIX */ -#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \ - (ptr), sizeof(*(ptr)))) - #define __xg(x) ((volatile long *)(x)) static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) @@ -15,167 +12,118 @@ static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) #define _set_64bit set_64bit +extern void __xchg_wrong_size(void); +extern void __cmpxchg_wrong_size(void); + /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway * Note 2: xchg has side effect, so that attribute volatile is necessary, * but generally the primitive is invalid, *ptr is output argument. --ANK */ -static inline unsigned long __xchg(unsigned long x, volatile void *ptr, - int size) -{ - switch (size) { - case 1: - asm volatile("xchgb %b0,%1" - : "=q" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 2: - asm volatile("xchgw %w0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 4: - asm volatile("xchgl %k0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 8: - asm volatile("xchgq %0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - } - return x; -} +#define __xchg(x, ptr, size) \ +({ \ + __typeof(*(ptr)) __x = (x); \ + switch (size) { \ + case 1: \ + asm volatile("xchgb %b0,%1" \ + : "=q" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile("xchgw %w0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile("xchgl %k0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile("xchgq %0,%1" \ + : "=r" (__x) \ + : "m" (*__xg(ptr)), "0" (__x) \ + : "memory"); \ + break; \ + default: \ + __xchg_wrong_size(); \ + } \ + __x; \ +}) + +#define xchg(ptr, v) \ + __xchg((v), (ptr), sizeof(*ptr)) + +#define __HAVE_ARCH_CMPXCHG 1 /* * Atomic compare and exchange. Compare OLD with MEM, if identical, * store NEW in MEM. 
Return the initial value in MEM. Success is * indicated by comparing RETURN with OLD. */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case 1: \ + asm volatile(lock "cmpxchgb %b1,%2" \ + : "=a"(__ret) \ + : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile(lock "cmpxchgw %w1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile(lock "cmpxchgl %k1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile(lock "cmpxchgq %1,%2" \ + : "=a"(__ret) \ + : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + : "memory"); \ + break; \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + __ret; \ +}) -#define __HAVE_ARCH_CMPXCHG 1 +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 8: - asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} +#define __sync_cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") -/* - * Always use locked operations when touching memory shared with a - * hypervisor, since the system may be SMP even if the guest kernel - * isn't. 
- */ -static inline unsigned long __sync_cmpxchg(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile("lock; cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile("lock; cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile("lock; cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} +#define __cmpxchg_local(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "") -static inline unsigned long __cmpxchg_local(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - asm volatile("cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - asm volatile("cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - asm volatile("cmpxchgl %k1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 8: - asm volatile("cmpxchgq %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} +#define cmpxchg(ptr, old, new) \ + __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define sync_cmpxchg(ptr, old, new) \ + __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define cmpxchg_local(ptr, old, new) \ + __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) -#define cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), sizeof(*(ptr)))) #define cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ cmpxchg((ptr), (o), (n)); \ }) -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) -#define sync_cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), \ - sizeof(*(ptr)))) + #define cmpxchg64_local(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ -- cgit v1.2.2 From a6f05a6a0a1713d5b019f096799d49226807d3df Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 8 Oct 2009 18:02:54 -0700 Subject: x86-64: make compat_start_thread() match start_thread() For no real good reason, compat_start_thread() was embedded inline in whereas the native start_thread() lives in process_*.c. Move compat_start_thread() to process_64.c, remove gratuitious differences, and fix a few items which mostly look like bit rot. In particular, compat_start_thread() didn't do free_thread_xstate(), which means it was hanging on to the xstate store area even when it was not needed. It was also not setting old_rsp, but it looks like that generally shouldn't matter for a 32-bit process. Note: compat_start_thread *has* to be a macro, since it is tested with start_thread_ia32() as the out of line function name. Signed-off-by: H. 
Peter Anvin Acked-by: Suresh Siddha --- arch/x86/include/asm/elf.h | 20 ++------------------ arch/x86/kernel/process_64.c | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 456a304b8172..8a024babe5e6 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -157,19 +157,6 @@ do { \ #define compat_elf_check_arch(x) elf_check_arch_ia32(x) -static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp) -{ - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - regs->ip = ip; - regs->sp = sp; - regs->flags = X86_EFLAGS_IF; - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; -} - static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { @@ -191,11 +178,8 @@ do { \ #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(&current->thread, regs, __USER_DS) -#define compat_start_thread(regs, ip, sp) \ -do { \ - start_ia32_thread(regs, ip, sp); \ - set_fs(USER_DS); \ -} while (0) +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); +#define compat_start_thread start_thread_ia32 #define COMPAT_SET_PERSONALITY(ex) \ do { \ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..7cf0a6b6d4bb 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -356,7 +356,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) percpu_write(old_rsp, new_sp); regs->cs = __USER_CS; regs->ss = __USER_DS; - regs->flags = 0x200; + regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* * Free the old FP and other extended state @@ -365,6 +365,27 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + loadsegment(fs, 0); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); + load_gs_index(0); + regs->ip = new_ip; + regs->sp = new_sp; + percpu_write(old_rsp, new_sp); + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + regs->flags = X86_EFLAGS_IF; + set_fs(USER_DS); + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} +#endif + /* * switch_to(x,y) should switch tasks from x to y. * -- cgit v1.2.2 From e634d8fc792c66c3d4ff45518c04848c1e28f221 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 9 Oct 2009 15:56:53 -0700 Subject: x86-64: merge the standard and compat start_thread() functions The only thing left that differs between the standard and compat start_thread functions is the actual segment numbers and the prototype, so have a single common function which contains the guts and two very small wrappers. Signed-off-by: H. 
Peter Anvin Acked-by: Suresh Siddha --- arch/x86/kernel/process_64.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7cf0a6b6d4bb..eb261c582a44 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -344,18 +344,20 @@ out: return err; } -void -start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) { loadsegment(fs, 0); - loadsegment(es, 0); - loadsegment(ds, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; percpu_write(old_rsp, new_sp); - regs->cs = __USER_CS; - regs->ss = __USER_DS; + regs->cs = _cs; + regs->ss = _ss; regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* @@ -363,26 +365,19 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) */ free_thread_xstate(current); } -EXPORT_SYMBOL_GPL(start_thread); + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} #ifdef CONFIG_IA32_EMULATION void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) { - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - regs->ip = new_ip; - regs->sp = new_sp; - percpu_write(old_rsp, new_sp); - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; - regs->flags = X86_EFLAGS_IF; - set_fs(USER_DS); - /* - * Free the old FP and other extended state - */ - free_thread_xstate(current); + start_thread_common(regs, new_ip, new_sp, + __USER32_CS, __USER32_DS, __USER32_DS); } #endif -- cgit v1.2.2 From 3bb258bf430d29a24350fe4f44f8bf07b7b7a8f6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 4 Oct 2009 17:53:29 -0700 Subject: ftrace.c: Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Remove prefixes from pr_, use pr_fmt(fmt). No change in output. Signed-off-by: Joe Perches Acked-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <9b377eefae9e28c599dd4a17bdc81172965e9931.1254701151.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9dbb527e1652..25e6f5fc4b1e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -9,6 +9,8 @@ * the dangers of modifying code on the run. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data) switch (faulted) { case 0: - pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); break; case 1: - pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + pr_info("converting mcount calls to 66 66 66 66 90\n"); memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); break; case 2: - pr_info("ftrace: converting mcount calls to jmp . + 5\n"); + pr_info("converting mcount calls to jmp . 
+ 5\n"); memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); break; } -- cgit v1.2.2 From 3c355863fb32070a2800f41106519c5c3038623a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 4 Oct 2009 17:53:40 -0700 Subject: testmmiotrace.c: Add and use pr_fmt(fmt) - Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt. - Strip MODULE_NAME from pr_s. - Remove MODULE_NAME definition. Signed-off-by: Joe Perches LKML-Reference: <3bb66cc7f85f77b9416902e1be7076f7e3f4ad48.1254701151.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/testmmiotrace.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 427fd1b56df5..8565d944f7cf 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -1,12 +1,13 @@ /* * Written by Pekka Paalanen, 2008-2009 */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -#define MODULE_NAME "testmmiotrace" - static unsigned long mmio_address; module_param(mmio_address, ulong, 0); MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " @@ -30,7 +31,7 @@ static unsigned v32(unsigned i) static void do_write_test(void __iomem *p) { unsigned int i; - pr_info(MODULE_NAME ": write test.\n"); + pr_info("write test.\n"); mmiotrace_printk("Write test.\n"); for (i = 0; i < 256; i++) @@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p) { unsigned int i; unsigned errs[3] = { 0 }; - pr_info(MODULE_NAME ": read test.\n"); + pr_info("read test.\n"); mmiotrace_printk("Read test.\n"); for (i = 0; i < 256; i++) @@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p) static void do_read_far_test(void __iomem *p) { - pr_info(MODULE_NAME ": read far test.\n"); + pr_info("read far test.\n"); mmiotrace_printk("Read far test.\n"); ioread32(p + read_far); @@ -78,7 +79,7 @@ static void do_test(unsigned long size) { void __iomem *p = ioremap_nocache(mmio_address, size); if (!p) { - pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); + pr_err("could not ioremap, aborting.\n"); return; } mmiotrace_printk("ioremap returned %p.\n", p); @@ -94,24 +95,22 @@ static int __init init(void) unsigned long size = (read_far) ? (8 << 20) : (16 << 10); if (mmio_address == 0) { - pr_err(MODULE_NAME ": you have to use the module argument " - "mmio_address.\n"); - pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" - " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + pr_err("you have to use the module argument mmio_address.\n"); + pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n"); return -ENXIO; } - pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " - "address space, and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); + pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); do_test(size); - pr_info(MODULE_NAME ": All done.\n"); + pr_info("All done.\n"); return 0; } static void __exit cleanup(void) { - pr_debug(MODULE_NAME ": unloaded.\n"); + pr_debug("unloaded.\n"); } module_init(init); -- cgit v1.2.2 From fb2531953fd8855abdcf458459020fd382c5deca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 7 Oct 2009 13:20:38 +0200 Subject: mce, edac: Use an atomic notifier for MCEs decoding Add an atomic notifier which ensures proper locking when conveying MCE info to EDAC for decoding. The actual notifier call overrides a default, negative priority notifier. 
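For illustration only (this sketch is not part of the patch, and the names edac_decode_mce and edac_mce_dec are hypothetical), an EDAC driver would hook the new chain roughly like this:

	static int edac_decode_mce(struct notifier_block *nb,
				   unsigned long val, void *data)
	{
		struct mce *m = data;

		/* render m->status, m->addr, ... in human-readable form */
		pr_info("decoded MCE: status 0x%llx\n",
			(unsigned long long)m->status);
		return NOTIFY_STOP;	/* skip lower-priority decoders */
	}

	static struct notifier_block edac_mce_dec = {
		.notifier_call	= edac_decode_mce,
		/* default priority 0 runs before the -1 fallback decoder */
	};

	/* at driver init time: */
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &edac_mce_dec);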
Note: make sure we register the default decoder only once since mcheck_init() runs on each CPU. Signed-off-by: Borislav Petkov LKML-Reference: <20091003065752.GA8935@liondog.tnic> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 3 ++- arch/x86/kernel/cpu/mcheck/mce.c | 29 ++++++++++++++++++++--------- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f1363b72364f..227a72df6441 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -108,6 +108,8 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) +extern struct atomic_notifier_head x86_mce_decoder_chain; + #ifdef __KERNEL__ #include @@ -213,6 +215,5 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); - #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1598a9436d0..15ba9c972d7a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -85,18 +85,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -static void default_decode_mce(struct mce *m) +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); + +static int default_decode_mce(struct notifier_block *nb, unsigned long val, + void *data) { pr_emerg("No human readable MCE decoding support on this CPU type.\n"); pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + + return NOTIFY_STOP; } -/* - * CPU/chipset specific EDAC code can register a callback here to print - * MCE errors in a human-readable form: - */ -void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; -EXPORT_SYMBOL(x86_mce_decode_callback); +static struct notifier_block mce_dec_nb = { + .notifier_call = default_decode_mce, + .priority = -1, +}; /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -204,9 +212,9 @@ static void print_mce(struct mce *m) /* * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that): + * (if the CPU has an implementation for that) */ - x86_mce_decode_callback(m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } static void print_mce_head(void) @@ -1420,6 +1428,9 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_cpu_features(c); mce_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); + + if (raw_smp_processor_id() == 0) + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); } /* -- cgit v1.2.2 From 494f6a9e12f5137d355d3ce3f5789ef148b642bc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 7 Oct 2009 19:04:29 -0400 Subject: this_cpu: Use this_cpu_xx in nmi handling this_cpu_inc/dec reduces the number of instructions needed. 
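In miniature, the conversion pattern looks like this (an illustrative sketch mirroring the hunk below, not additional patch content):

	/* before: compute the address of this CPU's copy, then increment */
	local_inc(&__get_cpu_var(alert_counter));

	/* after: a single %gs-prefixed increment, no address arithmetic */
	__this_cpu_inc(per_cpu_var(alert_counter));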
Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/kernel/apic/nmi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 7ff61d6a188a..e631cc4416f7 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -360,7 +360,7 @@ void stop_apic_nmi_watchdog(void *unused) */ static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(long, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog(void) @@ -437,8 +437,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + __this_cpu_inc(per_cpu_var(alert_counter)); + if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ @@ -446,7 +446,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); + __this_cpu_write(per_cpu_var(alert_counter), 0); } /* see if the nmi watchdog went off */ -- cgit v1.2.2 From c03cb3149daed3e411657e3212d05ae27cf1a874 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 11 Oct 2009 10:33:02 -0700 Subject: x86: Relegate CONFIG_PAT and CONFIG_MTRR configurability to EMBEDDED MTRR and PAT support (which got added to CPUs over 10 years ago) are no longer really optional in that more and more things are depending on PAT just working, including various drivers and newer versions of X. (to not even speak of MTRR) Having this as a regular config option just no longer makes sense. This patch relegates CONFIG_X86_PAT to the EMBEDDED category so ultra-embedded can still disable it if they really need to. Also-Suggested-by: Roland Dreier Signed-off-by: Arjan van de Ven Cc: Linus Torvalds Cc: Henrique de Moraes Holschuh LKML-Reference: <20091011103302.62bded41@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c876bace8fdc..a67363bbe825 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1321,7 +1321,9 @@ config MATH_EMULATION kernel, it won't hurt. config MTRR - bool "MTRR (Memory Type Range Register) support" + bool + default y + prompt "MTRR (Memory Type Range Register) support" if EMBEDDED ---help--- On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control @@ -1387,7 +1389,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT config X86_PAT bool - prompt "x86 PAT support" + default y + prompt "x86 PAT support" if EMBEDDED depends on MTRR ---help--- Use PAT attributes to setup page level cache control. -- cgit v1.2.2 From ae24ffe5ecec17c956ac25371d7c2e12b4b36e53 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 12 Oct 2009 10:18:23 -0400 Subject: x86, 64-bit: Move K8 B step iret fixup to fault entry asm Move the handling of truncated %rip from an iret fault to the fault entry path. This allows x86-64 to use the standard search_extable() function. 
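For reference, the generic search_extable() that x86-64 can now share is a plain binary search over the sorted exception table, roughly of this shape (a simplified sketch, not the exact lib/extable.c code):

	const struct exception_table_entry *
	search_extable(const struct exception_table_entry *first,
		       const struct exception_table_entry *last,
		       unsigned long value)
	{
		while (first <= last) {
			const struct exception_table_entry *mid;

			mid = ((last - first) >> 1) + first;
			if (mid->insn < value)
				first = mid + 1;
			else if (mid->insn > value)
				last = mid - 1;
			else
				return mid;
		}
		return NULL;	/* no fixup for this faulting address */
	}

With the truncated %rip repaired before the exception table is consulted, the K8-specific variant removed below is no longer needed.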
Signed-off-by: Brian Gerst Cc: Linus Torvalds Cc: Jan Beulich LKML-Reference: <1255357103-5418-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 1 - arch/x86/kernel/entry_64.S | 11 ++++++++--- arch/x86/mm/extable.c | 31 ------------------------------- 3 files changed, 8 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index d2c6c930b491..abd3e0ea762a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -570,7 +570,6 @@ extern struct movsl_mask { #ifdef CONFIG_X86_32 # include "uaccess_32.h" #else -# define ARCH_HAS_SEARCH_EXTABLE # include "uaccess_64.h" #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f8f358..af0f4b226dbe 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1491,12 +1491,17 @@ error_kernelspace: leaq irq_return(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP+8(%rsp) - je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret cmpq $gs_change,RIP+8(%rsp) je error_swapgs jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + je error_swapgs END(error_entry) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 61b41ca3b5a2..d0474ad2a6e5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs) return 0; } - -#ifdef CONFIG_X86_64 -/* - * Need to defined our own search_extable on X86_64 to work around - * a B stepping K8 bug. - */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - /* B stepping K8 bug */ - if ((value >> 32) == 0) - value |= 0xffffffffUL << 32; - - while (first <= last) { - const struct exception_table_entry *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - return NULL; -} -#endif -- cgit v1.2.2 From ad8f4356af58f7ded6b4a5787c67c7cab51066b5 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 6 Oct 2009 07:04:52 -0700 Subject: x86: Don't use the strict copy checks when branch profiling is in use The branch profiling creates very complex code for each if statement, to the point that gcc has trouble even analyzing something as simple as if (count > 5) count = 5; This then means that causing an error on code that gcc cannot analyze for copy_from_user() and co is not very productive. This patch excludes the strict copy checks in the case of branch profiling being enabled. Signed-off-by: Arjan van de Ven Cc: Steven Rostedt LKML-Reference: <20091006070452.5e1fc119@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 1bd2e36f1538..fb772b6a41ad 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -289,7 +289,7 @@ config OPTIMIZE_INLINING config DEBUG_STRICT_USER_COPY_CHECKS bool "Strict copy size checks" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING ---help--- Enabling this option turns a certain set of sanity checks for user copy operations into compile time failures. 
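To see why branch profiling defeats these checks: they only compile away when gcc can prove the size condition, and the instrumentation that TRACE_BRANCH_PROFILING adds to likely()/unlikely() makes that condition opaque. A hedged sketch of the kind of check involved (checked_copy_from_user and copy_overflow_detected are made-up names, not the kernel's):

	/* declared, but meant never to survive into a correct build */
	extern void copy_overflow_detected(void)
		__compiletime_error("copy_from_user() size check failed");

	static inline unsigned long
	checked_copy_from_user(void *to, const void __user *from, unsigned long n)
	{
		int sz = __compiletime_object_size(to);	/* -1 if unknown */

		if (likely(sz == -1 || sz >= n))	/* must fold at build time */
			n = _copy_from_user(to, from, n);
		else
			copy_overflow_detected();	/* compile-time failure */
		return n;
	}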
-- cgit v1.2.2 From 1af5ba514f0c2f2e2af965a4ffa5e8ab269271b9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 25 Sep 2009 15:19:47 -0700 Subject: x86: Clean up and add missing log levels for k8 Convert all printk's in arch/x86/mm/k8topology_64.c to use pr_info() or pr_err() appropriately. Adds log levels for messages currently lacking them. Signed-off-by: David Rientjes Cc: Yinghai Lu Cc: Balbir Singh Cc: Ankita Garg Cc: Len Brown LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/k8topology_64.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 268f8255280f..a81561acc20f 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -91,14 +91,14 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (nb < 0) return nb; - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) return -1; - printk(KERN_INFO "Number of nodes %d\n", numnodes); + pr_info("Number of nodes %d\n", numnodes); memset(&nodes, 0, sizeof(nodes)); prevbase = 0; @@ -111,28 +111,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodeid = limit & 7; if ((base & 3) == 0) { if (i < numnodes) - printk("Skipping disabled node %d\n", i); + pr_info("Skipping disabled node %d\n", i); continue; } if (nodeid >= numnodes) { - printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); + pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); continue; } if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", - i, base); + pr_info("Skipping node entry %d (base %lx)\n", + i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); + pr_err("Node %d using interleaving mode %lx/%lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -1; } if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", - nodeid); + pr_info("Node %d already present, skipping\n", + nodeid); continue; } @@ -154,24 +154,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (limit > end) limit = end; if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); + pr_err("Empty node %d\n", nodeid); continue; } if (limit < base) { - printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", + pr_err("Node %d bogus settings %lx-%lx.\n", nodeid, base, limit); continue; } /* Could sort here, but pun for now. Should not happen anyroads. */ if (prevbase > base) { - printk(KERN_ERR "Node map not sorted %lx,%lx\n", + pr_err("Node map not sorted %lx,%lx\n", prevbase, base); return -1; } - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); + pr_info("Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); found++; @@ -188,10 +188,10 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + pr_err("No NUMA node hash function found. 
Contact maintainer\n"); return -1; } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + pr_info("Using node hash shift of %d\n", memnode_shift); /* use the coreid bits from early_identify_cpu */ bits = boot_cpu_data.x86_coreid_bits; @@ -200,8 +200,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) /* need to get boot_cpu_id early for system with apicid lifting */ early_get_boot_cpu_id(); if (boot_cpu_physical_apicid > 0) { - printk(KERN_INFO "BSP APIC ID: %02x\n", - boot_cpu_physical_apicid); + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); apicid_base = boot_cpu_physical_apicid; } -- cgit v1.2.2 From 8ee2debce32412118cf8c239e0026ace56ea1425 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 25 Sep 2009 15:20:00 -0700 Subject: x86: Export k8 physical topology To eventually interleave emulated nodes over physical nodes, we need to know the physical topology of the machine without actually registering it. This does the k8 node setup in two parts: detection and registration. NUMA emulation can then use the physical topology detected to set up the address ranges of emulated nodes accordingly. If emulation isn't used, the k8 nodes are registered as normal. Two formals are added to the x86 NUMA setup functions: `acpi' and `k8'. These represent whether ACPI or K8 NUMA has been detected; both cannot be true at the same time. This specifies to the NUMA emulation code whether an underlying physical NUMA topology exists and which interface to use. This patch deals solely with separating the k8 setup path into Northbridge detection and registration steps and leaves the ACPI changes for a subsequent patch. The `acpi' formal is added here, however, to avoid touching all the header files again in the next patch. This approach also ensures emulated nodes will not span physical nodes so the true memory latency is not misrepresented. k8_get_nodes() may now be used to export the k8 physical topology of the machine for NUMA emulation.
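Condensed, the boot-time flow that this split enables looks as follows (a sketch of the intended call order; nr_phys and physnodes are illustrative names, and the real hunks follow below):

	/* step 1, detection: cache the northbridge topology, register nothing */
	k8 = !k8_numa_init(0, max_pfn);

	/* step 2a: NUMA emulation may consume the cached topology ... */
	nr_phys = k8_get_nodes(physnodes);

	/* step 2b: ... or, when emulation is not used, register the real nodes */
	if (!numa_off && k8 && !k8_scan_nodes())
		return;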
Signed-off-by: David Rientjes Cc: Andreas Herrmann Cc: Yinghai Lu Cc: Balbir Singh Cc: Ankita Garg Cc: Len Brown LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/k8.h | 4 ++- arch/x86/include/asm/page_types.h | 3 ++- arch/x86/kernel/setup.c | 10 +++++++- arch/x86/mm/init_32.c | 4 +-- arch/x86/mm/init_64.c | 3 ++- arch/x86/mm/k8topology_64.c | 52 +++++++++++++++++++++++++++++---------- arch/x86/mm/numa_32.c | 4 +-- arch/x86/mm/numa_64.c | 6 ++--- 8 files changed, 62 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index c2d1f3b58e5f..c092f720bd60 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -10,7 +10,9 @@ extern struct pci_dev **k8_northbridges; extern int num_k8_northbridges; extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); -extern int k8_scan_nodes(unsigned long start, unsigned long end); +extern int k8_get_nodes(struct bootnode *nodes); +extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int k8_scan_nodes(void); #ifdef CONFIG_K8_NB static inline struct pci_dev *node_to_k8_nb_misc(int node) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 6473f5ccff85..642fe34b36a2 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped; extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8); extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e09f0e2c14b5..fda0032c25c6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -106,6 +106,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include #endif @@ -691,6 +692,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { void __init setup_arch(char **cmdline_p) { + int acpi = 0; + int k8 = 0; + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -937,7 +941,11 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif - initmem_init(0, max_pfn); +#ifdef CONFIG_K8_NUMA + k8 = !k8_numa_init(0, max_pfn); +#endif + + initmem_init(0, max_pfn, acpi, k8); #ifdef CONFIG_ACPI_SLEEP /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 30938c1d8d5d..5e32b07b535d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -703,8 +703,8 @@ void __init find_low_pfn_range(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5a4398a6006b..c20d30b440de 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start, } #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { unsigned long bootmap_size, bootmap; diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 
a81561acc20f..b9e2dbfe55c3 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -24,6 +24,9 @@ #include #include +static struct bootnode __initdata nodes[8]; +static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; + static __init int find_northbridge(void) { int num; @@ -76,12 +79,26 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -int __init k8_scan_nodes(unsigned long start, unsigned long end) +int __init k8_get_nodes(struct bootnode *physnodes) { - unsigned numnodes, cores, bits, apicid_base; + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + +int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long start = PFN_PHYS(start_pfn); + unsigned long end = PFN_PHYS(end_pfn); + unsigned numnodes; unsigned long prevbase; - struct bootnode nodes[8]; - int i, j, nb, found = 0; + int i, nb, found = 0; u32 nodeid, reg; if (!early_pci_allowed()) @@ -98,9 +115,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (numnodes <= 1) return -1; - pr_info("Number of nodes %d\n", numnodes); + pr_info("Number of physical nodes %d\n", numnodes); - memset(&nodes, 0, sizeof(nodes)); prevbase = 0; for (i = 0; i < 8; i++) { unsigned long base, limit; @@ -130,7 +146,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -1; } - if (node_isset(nodeid, node_possible_map)) { + if (node_isset(nodeid, nodes_parsed)) { pr_info("Node %d already present, skipping\n", nodeid); continue; @@ -141,8 +157,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) limit |= (1<<24)-1; limit++; - if (limit > max_pfn << PAGE_SHIFT) - limit = max_pfn << PAGE_SHIFT; + if (limit > end) + limit = end; if (limit <= base) continue; @@ -180,12 +196,23 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; - node_set(nodeid, node_possible_map); + node_set(nodeid, nodes_parsed); } if (!found) return -1; + return 0; +} +int __init k8_scan_nodes(void) +{ + unsigned int bits; + unsigned int cores; + unsigned int apicid_base; + int i; + + BUG_ON(nodes_empty(nodes_parsed)); + node_possible_map = nodes_parsed; memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { pr_err("No NUMA node hash function found. 
Contact maintainer\n"); @@ -204,9 +231,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) apicid_base = boot_cpu_physical_apicid; } - for (i = 0; i < 8; i++) { - if (nodes[i].start == nodes[i].end) - continue; + for_each_node_mask(i, node_possible_map) { + int j; e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d2530062fe00..b20760ca7244 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -347,8 +347,8 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { int nid; long kva_target_pfn; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 459913beac71..dad5f42dd359 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -524,7 +524,8 @@ out: } #endif /* CONFIG_NUMA_EMU */ -void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, + int acpi, int k8) { int i; @@ -547,8 +548,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) #endif #ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, - last_pfn<<PAGE_SHIFT)) + if (!numa_off && k8 && !k8_scan_nodes()) return; -- cgit v1.2.2 From: David Rientjes Date: Fri, 25 Sep 2009 15:20:04 -0700 Subject: x86: Export srat physical topology This is the counterpart to "x86: export k8 physical topology" for SRAT. It is not as invasive because the acpi code already separates node setup into detection and registration steps, with the exception of registering e820 active regions in acpi_numa_memory_affinity_init(). This is now moved to acpi_scan_nodes() if NUMA emulation is disabled or deferred. acpi_numa_init() now returns a value which specifies whether an underlying SRAT was located. If so, that topology can be used by the emulation code to interleave emulated nodes over physical nodes or to register the nodes for ACPI. acpi_get_nodes() may now be used to export the srat physical topology of the machine for NUMA emulation. Signed-off-by: David Rientjes Cc: Andreas Herrmann Cc: Yinghai Lu Cc: Balbir Singh Cc: Ankita Garg Cc: Len Brown LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/acpi.h | 1 + arch/x86/kernel/setup.c | 5 +++-- arch/x86/mm/numa_64.c | 4 ++-- arch/x86/mm/srat_64.c | 28 +++++++++++++++++++++------- 4 files changed, 27 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 4518dc500903..e3d4a0daff57 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -158,6 +158,7 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; +extern int acpi_get_nodes(struct bootnode *physnodes); extern int acpi_scan_nodes(unsigned long start, unsigned long end); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) extern void acpi_fake_nodes(const struct bootnode *fake_nodes, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fda0032c25c6..f89141982702 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -938,11 +938,12 @@ void __init setup_arch(char **cmdline_p) /* * Parse SRAT to discover nodes. 
*/ - acpi_numa_init(); + acpi = acpi_numa_init(); #endif #ifdef CONFIG_K8_NUMA - k8 = !k8_numa_init(0, max_pfn); + if (!acpi) + k8 = !k8_numa_init(0, max_pfn); #endif initmem_init(0, max_pfn, acpi, k8); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index dad5f42dd359..d1a3d94efc8e 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -540,8 +540,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, #endif #ifdef CONFIG_ACPI_NUMA - if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT)) + if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, + last_pfn << PAGE_SHIFT)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index dbb5381f7b3b..891cbe65b2d5 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - e820_register_active_regions(node, start >> PAGE_SHIFT, - end >> PAGE_SHIFT); if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { update_nodes_add(node, start, end); @@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} +int __init acpi_get_nodes(struct bootnode *physnodes) +{ + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + /* Use the information discovered above to actually set up the nodes. */ int __init acpi_scan_nodes(unsigned long start, unsigned long end) { @@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); - if (!nodes_cover_memory(nodes)) { - bad_srat(); - return -1; - } - memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, memblk_nodeid); if (memnode_shift < 0) { @@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } + for_each_node_mask(i, nodes_parsed) + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); + if (!nodes_cover_memory(nodes)) { + bad_srat(); + return -1; + } + /* Account for nodes with cpus and no memory */ nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); -- cgit v1.2.2 From adc1938994f7f1112d335d998b5218b0aa680ad6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 25 Sep 2009 15:20:09 -0700 Subject: x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. 
The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node than on another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes Cc: Linus Torvalds Cc: Andreas Herrmann Cc: Yinghai Lu Cc: Balbir Singh Cc: Ankita Garg Cc: Len Brown LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 211 +++++++++++++++++++++++++++++++++++++++++++------- arch/x86/mm/srat_64.c | 1 - 2 files changed, 184 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index d1a3d94efc8e..086f98a66d80 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -306,8 +306,71 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ +static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; +static int __init setup_physnodes(unsigned long start, unsigned long end, + int acpi, int k8) +{ + int nr_nodes = 0; + int ret = 0; + int i; + +#ifdef CONFIG_ACPI_NUMA + if (acpi) + nr_nodes = acpi_get_nodes(physnodes); +#endif +#ifdef CONFIG_K8_NUMA + if (k8) + nr_nodes = k8_get_nodes(physnodes); +#endif + /* + * Basic sanity checking on the physical node map: there may be errors + * if the SRAT or K8 incorrectly reported the topology or the mem= + * kernel parameter is used. + */ + for (i = 0; i < nr_nodes; i++) { + if (physnodes[i].start == physnodes[i].end) + continue; + if (physnodes[i].start > end) { + physnodes[i].end = physnodes[i].start; + continue; + } + if (physnodes[i].end < start) { + physnodes[i].start = physnodes[i].end; + continue; + } + if (physnodes[i].start < start) + physnodes[i].start = start; + if (physnodes[i].end > end) + physnodes[i].end = end; + } + + /* + * Remove all nodes that have no memory or were truncated because of the + * limited address range. + */ + for (i = 0; i < nr_nodes; i++) { + if (physnodes[i].start == physnodes[i].end) + continue; + physnodes[ret].start = physnodes[i].start; + physnodes[ret].end = physnodes[i].end; + ret++; + } + + /* + * If no physical topology was detected, a single node is faked to cover + * the entire address space. 
+ */ + if (!ret) { + physnodes[ret].start = start; + physnodes[ret].end = end; + ret = 1; + } + return ret; +} + /* * Setups up nid to range from addr to addr + size. If the end * boundary is greater than max_addr, then max_addr is used instead. @@ -315,11 +378,9 @@ static char *cmdline __initdata; * allocation past addr and -1 otherwise. addr is adjusted to be at * the end of the node. */ -static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, - u64 size, u64 max_addr) +static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) { int ret = 0; - nodes[nid].start = *addr; *addr += size; if (*addr >= max_addr) { @@ -334,13 +395,112 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, return ret; } +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. The return value is the number of nodes allocated. + */ +static int __init split_nodes_interleave(u64 addr, u64 max_addr, + int nr_phys_nodes, int nr_nodes) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 size; + int big; + int ret = 0; + int i; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + for (i = 0; i < nr_phys_nodes; i++) + if (physnodes[i].start != physnodes[i].end) + node_set(i, physnode_mask); + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 end = physnodes[i].start + size; + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + + if (ret < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - physnodes[i].start - + e820_hole_size(physnodes[i].start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > physnodes[i].end) { + end = physnodes[i].end; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (physnodes[i].end - end - + e820_hole_size(end, physnodes[i].end) < size) + end = physnodes[i].end; + + /* + * Avoid allocating more nodes than requested, which can + * happen as a result of rounding down each node's size + * to FAKE_NODE_MIN_SIZE. + */ + if (nodes_weight(physnode_mask) + ret >= nr_nodes) + end = physnodes[i].end; + + if (setup_node_range(ret++, &physnodes[i].start, + end - physnodes[i].start, + physnodes[i].end) < 0) + node_clear(i, physnode_mask); + } + } + return ret; +} + /* * Splits num_nodes nodes up equally starting at node_start. 
The return value * is the number of nodes split up and addr is adjusted to be at the end of the * last node allocated. */ -static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, +static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, int num_nodes) { unsigned int big; @@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, break; } } - if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; } return i - node_start + 1; @@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, * always assigned to a final node and can be asymmetric. Returns the number of * nodes split. */ -static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, u64 size) +static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, + u64 size) { int i = node_start; size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, nodes, addr, size, max_addr)) + while (!setup_node_range(i++, addr, size, max_addr)) ; return i - node_start; } @@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ -static struct bootnode nodes[MAX_NUMNODES] __initdata; - -static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn) +static int __init numa_emulation(unsigned long start_pfn, + unsigned long last_pfn, int acpi, int k8) { u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; + int num_phys_nodes; - memset(&nodes, 0, sizeof(nodes)); + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); /* * If the numa=fake command-line is just a single number N, split the * system RAM into N fake nodes. 
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { long n = simple_strtol(cmdline, NULL, 0); - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); + num_nodes = split_nodes_interleave(addr, max_addr, + num_phys_nodes, n); if (num_nodes < 0) return num_nodes; goto out; @@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; if (size) for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, nodes, - &addr, size, max_addr) < 0) + if (setup_node_range(num_nodes, &addr, + size, max_addr) < 0) goto done; if (!*cmdline) break; @@ -473,7 +634,7 @@ done: if (addr < max_addr) { if (coeff_flag && coeff < 0) { /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes += split_nodes_by_size(&addr, max_addr, num_nodes, num); goto out; } @@ -482,7 +643,7 @@ done: /* Split remaining nodes into coeff chunks */ if (coeff <= 0) break; - num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes += split_nodes_equally(&addr, max_addr, num_nodes, coeff); break; case ',': @@ -490,8 +651,8 @@ done: break; default: /* Give one final node */ - setup_node_range(num_nodes, nodes, &addr, - max_addr - addr, max_addr); + setup_node_range(num_nodes, &addr, max_addr - addr, + max_addr); num_nodes++; } } @@ -505,14 +666,10 @@ out: } /* - * We need to vacate all active ranges that may have been registered by - * SRAT and set acpi_numa to -1 so that srat_disabled() always returns - * true. NUMA emulation has succeeded so we will not scan ACPI nodes. + * We need to vacate all active ranges that may have been registered for + * the e820 memory map. */ remove_all_active_ranges(); -#ifdef CONFIG_ACPI_NUMA - acpi_numa = -1; -#endif for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); @@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, last_pfn)) + if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 891cbe65b2d5..34aa438d60b6 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -468,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) for (i = 0; i < num_nodes; i++) if (fake_nodes[i].start != fake_nodes[i].end) node_set(i, nodes_parsed); - WARN_ON(!nodes_cover_memory(fake_nodes)); } static int null_slit_node_compare(int a, int b) -- cgit v1.2.2 From def3c5d0a34e4b09b3cea4435c17209ad347104d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 14:09:07 -0700 Subject: x86: use kernel_stack_pointer() in process_32.c The way to obtain a kernel-mode stack pointer from a struct pt_regs in 32-bit mode is "subtle": the stack doesn't actually contain the stack pointer, but rather the location where it would have been marks the actual previous stack frame. For clarity, use kernel_stack_pointer() instead of coding this weirdness explicitly. Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/process_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf79567cdab..35e6fad73e0d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -134,7 +134,7 @@ void __show_regs(struct pt_regs *regs, int all) ss = regs->ss & 0xffff; gs = get_user_gs(regs); } else { - sp = (unsigned long) (&regs->sp); + sp = kernel_stack_pointer(regs); savesegment(ss, ss); savesegment(gs, gs); } -- cgit v1.2.2 From a343c75d338aa2afaea4a2a8e40de9e67b6fb4a7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 14:11:09 -0700 Subject: x86: use kernel_stack_pointer() in dumpstack.c The way to obtain a kernel-mode stack pointer from a struct pt_regs in 32-bit mode is "subtle": the stack doesn't actually contain the stack pointer, but rather the location where it would have been marks the actual previous stack frame. For clarity, use kernel_stack_pointer() instead of coding this weirdness explicitly. Furthermore, user_mode() is only valid when the process is known to not run in V86 mode. Use the safer user_mode_vm() instead. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/dumpstack.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2d8a371d4339..b8ce165dde5d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -268,11 +268,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) show_registers(regs); #ifdef CONFIG_X86_32 - sp = (unsigned long) (&regs->sp); - savesegment(ss, ss); - if (user_mode(regs)) { + if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); } printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); print_symbol("%s", regs->ip); -- cgit v1.2.2 From 5ca6c0ca5dbf105d7b0ffdae2289519982189730 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 14:12:18 -0700 Subject: x86: use kernel_stack_pointer() in kgdb.c The way to obtain a kernel-mode stack pointer from a struct pt_regs in 32-bit mode is "subtle": the stack doesn't actually contain the stack pointer, but rather the location where it would have been marks the actual previous stack frame. For clarity, use kernel_stack_pointer() instead of coding this weirdness explicitly. Signed-off-by: H. Peter Anvin Cc: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8d82a77a3f3b..3310d849abd2 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -88,7 +88,6 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; - gdb_regs[GDB_SP] = (int)&regs->sp; #else gdb_regs[GDB_R8] = regs->r8; gdb_regs[GDB_R9] = regs->r9; @@ -101,8 +100,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs32[GDB_PS] = regs->flags; gdb_regs32[GDB_CS] = regs->cs; gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; #endif + gdb_regs[GDB_SP] = kernel_stack_pointer(regs); } /** -- cgit v1.2.2 From 98272ed0d2e6509fe7dc571e77956c99bf653bb6 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Mon, 12 Oct 2009 14:14:10 -0700 Subject: x86: use kernel_stack_pointer() in kprobes.c The way to obtain a kernel-mode stack pointer from a struct pt_regs in 32-bit mode is "subtle": the stack doesn't actually contain the stack pointer, but rather the location where it would have been marks the actual previous stack frame. For clarity, use kernel_stack_pointer() instead of coding this weirdness explicitly. Signed-off-by: H. Peter Anvin Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu --- arch/x86/kernel/kprobes.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..2ee4fa3a3f01 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -60,19 +60,7 @@ void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#ifdef CONFIG_X86_64 -#define stack_addr(regs) ((unsigned long *)regs->sp) -#else -/* - * "®s->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs - * don't save the ss and esp registers if the CPU is already in kernel - * mode when it traps. So for kprobes, regs->sp and regs->ss are not - * the [nonexistent] saved stack pointer and ss register, but rather - * the top 8 bytes of the pre-int3 stack. So ®s->sp happens to - * point to the top of the pre-int3 stack. - */ -#define stack_addr(regs) ((unsigned long *)®s->sp) -#endif +#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ -- cgit v1.2.2 From 7a693d3f0d10f978ebdf3082c41404ab97106567 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 13 Oct 2009 08:16:30 +0200 Subject: perf_events, x86: Fix event constraints code There was namespace overlap due to a rename i did - this caused the following build warning, reported by Stephen Rothwell against linux-next x86_64 allmodconfig: arch/x86/kernel/cpu/perf_event.c: In function 'intel_get_event_idx': arch/x86/kernel/cpu/perf_event.c:1445: warning: 'event_constraint' is used uninitialized in this function This is a real bug not just a warning: fix it by renaming the global event-constraints table pointer to 'event_constraints'. 
Reported-by: Stephen Rothwell Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091013144223.369d616d.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9961d845719d..2e20bca3cca1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -124,7 +124,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static const struct event_constraint *event_constraint; +static const struct event_constraint *event_constraints; /* * Not sure about some of these @@ -1442,12 +1442,12 @@ intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) const struct event_constraint *event_constraint; int i, code; - if (!event_constraint) + if (!event_constraints) goto skip; code = hwc->config & CORE_EVNTSEL_EVENT_MASK; - for_each_event_constraint(event_constraint, event_constraint) { + for_each_event_constraint(event_constraint, event_constraints) { if (code == event_constraint->code) { for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { if (!test_and_set_bit(i, cpuc->used_mask)) @@ -2047,12 +2047,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ - event_constraint = intel_p6_event_constraints; + event_constraints = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ - event_constraint = intel_p6_event_constraints; + event_constraints = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -2124,14 +2124,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); - event_constraint = intel_core_event_constraints; + event_constraints = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - event_constraint = intel_nehalem_event_constraints; + event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: -- cgit v1.2.2 From a2e2725541fad72416326798c2d7fa4dafb7d337 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 Oct 2009 23:40:10 -0700 Subject: net: Introduce recvmmsg socket syscall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Meaning receive multiple messages, reducing the number of syscalls and net stack entry/exit operations. Next patches will introduce mechanisms where protocols that want to optimize this operation will provide an unlocked_recvmsg operation. This takes into account comments made by: . Paul Moore: sock_recvmsg is called only for the first datagram, sock_recvmsg_nosec is used for the rest. . Caitlin Bestler: recvmmsg now has a struct timespec timeout, which works in the same fashion as the ppoll one. If the underlying protocol returns a datagram with MSG_OOB set, this will make recvmmsg return right away with as many datagrams (+ the OOB one) as it has received so far.
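As an illustration, a hedged userspace sketch of a batched receive over this syscall. It uses a raw syscall(2) invocation since no libc wrapper existed when this was merged; the local mmsghdr layout and the (fd, vec, vlen, flags, timeout) signature follow this series, and the fallback syscall number is the x86_64 one added by the diffs below:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <time.h>
	#include <sys/socket.h>
	#include <sys/syscall.h>
	#include <sys/uio.h>
	#include <netinet/in.h>

	#ifndef __NR_recvmmsg
	#define __NR_recvmmsg 299	/* x86_64 number from this series */
	#endif

	#define VLEN 8

	struct my_mmsghdr {		/* layout introduced by this series */
		struct msghdr	msg_hdr;
		unsigned int	msg_len;	/* bytes received per datagram */
	};

	int main(void)
	{
		struct my_mmsghdr msgs[VLEN];
		struct iovec iovecs[VLEN];
		char bufs[VLEN][1500];
		struct timespec timeout = { .tv_sec = 1, .tv_nsec = 0 };
		struct sockaddr_in addr = { .sin_family = AF_INET };
		long i, n;
		int fd;

		addr.sin_port = htons(12345);
		addr.sin_addr.s_addr = htonl(INADDR_ANY);

		fd = socket(AF_INET, SOCK_DGRAM, 0);
		if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
			return 1;

		memset(msgs, 0, sizeof(msgs));
		for (i = 0; i < VLEN; i++) {
			iovecs[i].iov_base = bufs[i];
			iovecs[i].iov_len  = sizeof(bufs[i]);
			msgs[i].msg_hdr.msg_iov    = &iovecs[i];
			msgs[i].msg_hdr.msg_iovlen = 1;
		}

		/* One kernel entry for up to VLEN datagrams: */
		n = syscall(__NR_recvmmsg, fd, msgs, VLEN, 0, &timeout);
		for (i = 0; i < n; i++)
			printf("datagram %ld: %u bytes\n", i, msgs[i].msg_len);

		close(fd);
		return 0;
	}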
This paves the way for a subsequent optimization, sk_prot->unlocked_recvmsg, where we will be able to acquire the lock only at batch start and end, not at every underlying recvmsg call. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 74619c4f9fda..11a6c79d5f46 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -832,4 +832,5 @@ ia32_sys_call_table: .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open + .quad compat_sys_recvmmsg ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 6fb3c209a7e3..3baf379fa840 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -342,10 +342,11 @@ #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 +#define __NR_recvmmsg 337 #ifdef __KERNEL__ -#define NR_syscalls 337 +#define NR_syscalls 338 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 8d3ad0adbc68..4843f7ba754a 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev) __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) #define __NR_perf_event_open 298 __SYSCALL(__NR_perf_event_open, sys_perf_event_open) +#define __NR_recvmmsg 299 +__SYSCALL(__NR_recvmmsg, sys_recvmmsg) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 0157cd26d7cc..70c2125d55b9 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -336,3 +336,4 @@ ENTRY(sys_call_table) .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open + .long sys_recvmmsg -- cgit v1.2.2 From 8968f9d3dc23d9a1821d97c6f11e72a59382e56c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 13 Oct 2009 16:19:41 +0900 Subject: perf_event, x86, mce: Use TRACE_EVENT() for MCE logging This approach is the first baby step towards solving many of the structural problems the x86 MCE logging code is having today: - It has a private ring-buffer implementation that has a number of limitations and has been historically fragile and buggy. - It is using a quirky /dev/mcelog ioctl driven ABI that is MCE specific. /dev/mcelog is not part of any larger logging framework and hence has remained on the fringes for many years. - The MCE logging code is still very unclean partly due to its ABI limitations. Fields are being reused for multiple purposes, and the whole message structure is limited and x86 specific to begin with. All in one, the x86 tree would like to move away from this private implementation of an event logging facility to a broader framework. By using perf events we gain the following advantages: - Multiple user-space agents can access MCE events. We can have an mcelog daemon running but also a system-wide tracer capturing important events in flight-recorder mode. - Sampling support: the kernel and the user-space call-chain of MCE events can be stored and analyzed as well. 
This way actual patterns of bad behavior can be matched to precisely what kind of activity happened in the kernel (and/or in the app) around that moment in time. - Coupling with other hardware and software events: the PMU can track a number of other anomalies - monitoring software might choose to monitor those plus the MCE events as well - in one coherent stream of events. - Discovery of MCE sources - tracepoints are enumerated and tools can act upon the existence (or non-existence) of various channels of MCE information. - Filtering support: we just subscribe to and act upon the events we are interested in. Then even on a per event source basis there are in-kernel filter expressions available that can restrict the amount of data that hits the event channel. - Arbitrarily deep per cpu buffering of events - we can buffer 32 entries or we can buffer as much as we want, as long as we have the RAM. - An NMI-safe ring-buffer implementation - mappable to user-space. - Built-in support for timestamping of events, PID markers, CPU markers, etc. - A rich ABI accessible over system call interface. Per cpu, per task and per workload monitoring of MCE events can be done this way. The ABI itself has a nice, meaningful structure. - Extensible ABI: new fields can be added without breaking tooling. New tracepoints can be added as the hardware side evolves. There are various parsers that can be used. - Lots of scheduling/buffering/batching modes of operation for MCE events. poll() support. mmap() support. read() support. You name it. - Rich tooling support: even without any MCE specific extensions added, the 'perf' tool today offers various views of MCE data: perf report, perf stat, perf trace can all be used to view logged MCE events and perhaps correlate them to certain user-space usage patterns. But it can be used directly as well, for user-space agents and policy action in mcelog, etc. With this we hope to achieve significant code cleanup and feature improvements in the MCE code, and we hope to be able to drop the /dev/mcelog facility in the end. This patch is just a plain dumb dump of mce_log() records to the tracepoints / perf events framework - a first proof of concept step. Signed-off-by: Hidetoshi Seto Cc: Huang Ying Cc: Andi Kleen LKML-Reference: <4AD42A0D.7050104@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1598a9436d0..39caea3d8bc3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,9 @@ #include "mce-internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/mce.h> + int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -141,6 +144,9 @@ void mce_log(struct mce *mce) { unsigned next, entry; + /* Emit the trace record: */ + trace_mce_record(mce); + mce->finished = 0; wmb(); for (;;) { -- cgit v1.2.2 From 194ec34184869f0de1cf255c924fc5299e1b3d27 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 13 Oct 2009 16:33:50 -0400 Subject: function-graph/x86: Replace unbalanced ret with jmp The function graph tracer replaces the return address with a hook to trace the exit of the function call. This hook will finish by returning to the real location the function should return to. But the current implementation uses a ret to jump to the real return location. This causes an imbalance between calls and ret.
That is, the original function does a call, the ret goes to the handler and then the handler does a ret without a matching call. The function graph tracer itself already breaks the branch predictor by replacing the original ret, but using a second ret and causing an imbalance breaks the predictor even more. This patch replaces the ret with a jmp to keep the calls and ret balanced. I tested this on one box and it showed a 1.7% increase in performance. Another box only showed a small 0.3% increase. But no box that I tested this on showed a decrease in performance by making this change. Signed-off-by: Steven Rostedt Acked-by: Mathieu Desnoyers Cc: Frederic Weisbecker LKML-Reference: <20091013203425.042034383@goodmis.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 7 ++----- arch/x86/kernel/entry_64.S | 6 +++--- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c6..7d52e9da5e0c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1185,17 +1185,14 @@ END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 pushl %eax - pushl %ecx pushl %edx movl %ebp, %eax call ftrace_return_to_handler - movl %eax, 0xc(%esp) + movl %eax, %ecx popl %edx - popl %ecx popl %eax - ret + jmp *%ecx #endif .section .rodata,"a" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f8f358..bd5bbddddf91 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -155,11 +155,11 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 16(%rsp) + movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $16, %rsp - retq + addq $24, %rsp + jmp *%rdi #endif -- cgit v1.2.2 From 9844ab11c763bfed9f054c82366b19dcda66aca9 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 14 Oct 2009 00:07:03 +0400 Subject: x86, apic: Introduce the NOOP apic driver Introduce NOOP APIC driver. We should use it in case the apic was disabled due to hardware or software/firmware problems (including the case where the user requested to disable it). The driver attempts to catch any inappropriate apic operation call and issues a warning. It also makes it possible to use some apic operations, like IPI calls and reads/writes, without checking for apic presence, which should make callers' code simpler.
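The underlying pattern, in a hedged standalone sketch (generic names, not the kernel's struct apic): a table of function pointers starts out pointing at stubs that warn and do nothing, so call sites never need a presence check, and a probe routine later swaps in the real driver:

	#include <stdio.h>

	struct apic_like_ops {
		void (*send_ipi)(int vector);
		unsigned int (*read)(unsigned int reg);
	};

	static void noop_send_ipi(int vector)
	{
		/* Reaching this with the device disabled is a caller bug: warn. */
		fprintf(stderr, "noop: send_ipi(%d) called while disabled\n", vector);
	}

	static unsigned int noop_read(unsigned int reg)
	{
		return 0;	/* reads are harmless; pretend the register is 0 */
	}

	static struct apic_like_ops apic_noop_ops = {
		.send_ipi = noop_send_ipi,
		.read     = noop_read,
	};

	/* Starts as the noop driver; probe would install the real one here. */
	static struct apic_like_ops *ops = &apic_noop_ops;

	int main(void)
	{
		ops->send_ipi(2);	/* safe even though nothing was probed */
		return (int)ops->read(0x20);
	}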
Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: macro@linux-mips.org LKML-Reference: <20091013201022.534682104@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 + arch/x86/kernel/apic/Makefile | 2 +- arch/x86/kernel/apic/apic_noop.c | 194 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/apic/apic_noop.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 474d80d3e6cc..08a5f420e07b 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -488,6 +488,8 @@ static inline unsigned int read_apic_id(void) extern void default_setup_apic_routing(void); +extern struct apic apic_noop; + #ifdef CONFIG_X86_32 extern struct apic apic_default; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index da7b7b9f8bd8..565c1bfc507d 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,7 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c new file mode 100644 index 000000000000..0b93ec2fde0a --- /dev/null +++ b/arch/x86/kernel/apic/apic_noop.c @@ -0,0 +1,194 @@ +/* + * NOOP APIC driver. + * + * Does almost nothing and should be substituted by a real apic driver via + * probe routine. + * + * Though in case if apic is disabled (for some reason) we try + * to not uglify the caller's code and allow to call (some) apic routines + * like self-ipi, etc... 
and issue a warning if an operation is not allowed + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* + * some operations should never be reached with + * noop apic if it's not turned off, this mostly + * means the caller forgot to disable apic (or + * check the apic presence) before doing a call + */ +static void warn_apic_enabled(void) +{ + WARN_ONCE((cpu_has_apic || !disable_apic), + "APIC: Called for NOOP operation with apic enabled\n"); +} + +/* + * To check operations but do not bloat source code + */ +#define NOOP_FUNC(func) func { warn_apic_enabled(); } +#define NOOP_FUNC_RET(func, ret) func { warn_apic_enabled(); return ret; } + +NOOP_FUNC(static void noop_init_apic_ldr(void)) +NOOP_FUNC(static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector)) +NOOP_FUNC(static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)) +NOOP_FUNC(static void noop_send_IPI_allbutself(int vector)) +NOOP_FUNC(static void noop_send_IPI_all(int vector)) +NOOP_FUNC(static void noop_send_IPI_self(int vector)) +NOOP_FUNC_RET(static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip), -1) +NOOP_FUNC(static void noop_apic_write(u32 reg, u32 v)) +NOOP_FUNC(void noop_apic_wait_icr_idle(void)) +NOOP_FUNC_RET(static u32 noop_safe_apic_wait_icr_idle(void), 0) +NOOP_FUNC_RET(static u64 noop_apic_icr_read(void), 0) +NOOP_FUNC(static void noop_apic_icr_write(u32 low, u32 id)) +NOOP_FUNC_RET(static physid_mask_t noop_ioapic_phys_id_map(physid_mask_t phys_map), phys_map) +NOOP_FUNC_RET(static int noop_cpu_to_logical_apicid(int cpu), 1) +NOOP_FUNC_RET(static int noop_default_phys_pkg_id(int cpuid_apic, int index_msb), 0) +NOOP_FUNC_RET(static unsigned int noop_get_apic_id(unsigned long x), 0) + +static int noop_probe(void) +{ + /* should not ever be enabled this way */ + return 0; +} + +static int noop_apic_id_registered(void) +{ + warn_apic_enabled(); + return physid_isset(read_apic_id(), phys_cpu_present_map); +} + +static const struct cpumask *noop_target_cpus(void) +{ + warn_apic_enabled(); + + /* only BSP here */ + return cpumask_of(0); +} + +static unsigned long noop_check_apicid_used(physid_mask_t bitmap, int apicid) +{ + warn_apic_enabled(); + return physid_isset(apicid, bitmap); +} + +static unsigned long noop_check_apicid_present(int bit) +{ + warn_apic_enabled(); + return physid_isset(bit, phys_cpu_present_map); +} + +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + warn_apic_enabled(); + if (cpu != 0) + pr_warning("APIC: Vector allocated for non-BSP cpu\n"); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +int noop_apicid_to_node(int logical_apicid) +{ + warn_apic_enabled(); + + /* we're always on node 0 */ + return 0; +} + +static u32 noop_apic_read(u32 reg) +{ + /* + * noop-read is always safe until we have + * non-disabled unit + */ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +struct apic apic_noop = { + .name = "noop", + .probe = noop_probe, + .acpi_madt_oem_check = NULL, + + .apic_id_registered = noop_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = noop_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = noop_check_apicid_used, + .check_apicid_present = noop_check_apicid_present, + + 
.vector_allocation_domain = noop_vector_allocation_domain, + .init_apic_ldr = noop_init_apic_ldr, + + .ioapic_phys_id_map = noop_ioapic_phys_id_map, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .apicid_to_node = noop_apicid_to_node, + + .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = default_apicid_to_cpu_present, + + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + + .phys_pkg_id = noop_default_phys_pkg_id, + + .mps_oem_check = NULL, + + .get_apic_id = noop_get_apic_id, + .set_apic_id = NULL, + .apic_id_mask = 0x0F << 24, + + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = noop_send_IPI_mask, + .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, + .send_IPI_allbutself = noop_send_IPI_allbutself, + .send_IPI_all = noop_send_IPI_all, + .send_IPI_self = noop_send_IPI_self, + + .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, + + /* should be safe */ + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + + .wait_for_init_deassert = NULL, + + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = noop_apic_read, + .write = noop_apic_write, + .icr_read = noop_apic_icr_read, + .icr_write = noop_apic_icr_write, + .wait_icr_idle = noop_apic_wait_icr_idle, + .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, +}; -- cgit v1.2.2 From a933c61829509eb27083146dda392132baa0969a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 14 Oct 2009 00:07:04 +0400 Subject: x86, apic: Use apic noop driver In case the apic is disabled, we may use the whole apic NOOP driver instead of sparsely poking individual functions in the apic driver. The NOOP driver would also catch any inappropriate apic operation calls (not just read/write). Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: macro@linux-mips.org LKML-Reference: <20091013201022.747817361@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 894aa97f0717..61a5628810da 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -241,28 +241,12 @@ static int modern_apic(void) } /* - * bare function to substitute write operation - * and it's _that_ fast :) - */ -static void native_apic_write_dummy(u32 reg, u32 v) -{ - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); -} - -static u32 native_apic_read_dummy(u32 reg) -{ - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); - return 0; -} - -/* - * right after this call apic->write/read doesn't do anything - * note that there is no restore operation it works one way + * right after this call apic become NOOP driven + * so apic->write/read doesn't do anything */ void apic_disable(void) { - apic->read = native_apic_read_dummy; - apic->write = native_apic_write_dummy; + apic = &apic_noop; } void native_apic_wait_icr_idle(void) -- cgit v1.2.2 From 2626eb2b2fd958dc0f683126aa84e93b939699a1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 14 Oct 2009 00:07:05 +0400 Subject: x86, apic: Limit apic dumping, introduce new show_lapic= setup option If a system has a large number of cpus, printing apic contents may consume a long time.
We limit such output to one apic by default. But to allow seeing all apics, or some subset of them, we introduce the "show_lapic" setup option, which allows us to limit/unlimit the number of APICs being dumped. Example: apic=debug show_lapic=5, or apic=debug show_lapic=all. Also move the apic_verbosity check up so that helper routines do not need to inspect it at all. Suggested-by: Yinghai Lu Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: macro@linux-mips.org LKML-Reference: <20091013201022.926793122@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 47 ++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index dc69f28489f5..8c718c93d079 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1599,9 +1599,6 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_desc *desc; unsigned int irq; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", @@ -1708,9 +1705,6 @@ __apicdebuginit(void) print_APIC_field(int base) { int i; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG); for (i = 0; i < 8; i++) @@ -1724,9 +1718,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy) unsigned int i, v, ver, maxlvt; u64 icr; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); @@ -1824,13 +1815,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk("\n"); } -__apicdebuginit(void) print_all_local_APICs(void) +__apicdebuginit(void) print_local_APICs(int maxcpu) { int cpu; + if (!maxcpu) + return; + preempt_disable(); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } preempt_enable(); } @@ -1839,7 +1836,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) + if (!nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1866,21 +1863,41 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -__apicdebuginit(int) print_all_ICs(void) +static int __initdata show_lapic = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +__apicdebuginit(int) print_ICs(void) { + if (apic_verbosity == APIC_QUIET) + return 0; + print_PIC(); /* don't print out if apic is not there */ if (!cpu_has_apic && !apic_from_smp_config()) return 0; - print_all_local_APICs(); + print_local_APICs(show_lapic); print_IO_APIC(); return 0; } -fs_initcall(print_all_ICs); +fs_initcall(print_ICs); /* Where if anywhere is the i8259 connect in external int mode */ -- cgit v1.2.2 From 6c2c502910247d2820cb630e7b28fb6bdecdbf45 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 30 Sep 2009 11:02:59 -0500 Subject: x86: SGI UV: Fix irq affinity for hub based interrupts This patch fixes handling of uv hub irq affinity.
IRQs with ALL or NODE affinity can be routed to cpus other than their originally assigned cpu. Those with CPU affinity cannot be rerouted. Signed-off-by: Dimitri Sivanich LKML-Reference: <20090930160259.GA7822@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_irq.h | 15 ++++- arch/x86/kernel/apic/io_apic.c | 49 +++++++++++++-- arch/x86/kernel/uv_irq.c | 128 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 177 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h index 9613c8c0b647..5397e1290952 100644 --- a/arch/x86/include/asm/uv/uv_irq.h +++ b/arch/x86/include/asm/uv/uv_irq.h @@ -25,12 +25,21 @@ struct uv_IO_APIC_route_entry { dest : 32; }; +enum { + UV_AFFINITY_ALL, + UV_AFFINITY_NODE, + UV_AFFINITY_CPU +}; + extern struct irq_chip uv_irq_chip; -extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); +extern int +arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long, int); extern void arch_disable_uv_irq(int, unsigned long); +extern int uv_set_irq_affinity(unsigned int, const struct cpumask *); -extern int uv_setup_irq(char *, int, int, unsigned long); -extern void uv_teardown_irq(unsigned int, int, unsigned long); +extern int uv_irq_2_mmr_info(int, unsigned long *, int *); +extern int uv_setup_irq(char *, int, int, unsigned long, int); +extern void uv_teardown_irq(unsigned int); #endif /* _ASM_X86_UV_UV_IRQ_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8c718c93d079..bb52e7f6e953 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3731,9 +3731,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) * on the specified blade to allow the sending of MSIs to the specified CPU. */ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset) + unsigned long mmr_offset, int restrict) { const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@ -3749,6 +3750,11 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; + if (restrict == UV_AFFINITY_CPU) + desc->status |= IRQ_NO_BALANCING; + else + desc->status |= IRQ_MOVE_PCNTXT; + spin_lock_irqsave(&vector_lock, flags); set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, irq_name); @@ -3777,11 +3783,10 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, * Disable the specified MMR located on the specified blade so that MSIs are * longer allowed to be sent. 
*/ -void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) +void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) { unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -3789,9 +3794,45 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->mask = 1; - mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); } + +int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + unsigned int dest; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long mmr_offset; + unsigned mmr_pnode; + + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} #endif /* CONFIG_X86_64 */ int __init io_apic_get_redir_entries (int ioapic) diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index aeef529917e4..9a83775ab0f3 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -9,10 +9,22 @@ */ #include +#include #include #include #include +#include + +/* MMR offset and pnode of hub sourcing interrupts for a given irq */ +struct uv_irq_2_mmr_pnode{ + struct rb_node list; + unsigned long offset; + int pnode; + int irq; +}; +static spinlock_t uv_irq_lock; +static struct rb_root uv_irq_root; static void uv_noop(unsigned int irq) { @@ -39,25 +51,106 @@ struct irq_chip uv_irq_chip = { .unmask = uv_noop, .eoi = uv_ack_apic, .end = uv_noop, + .set_affinity = uv_set_irq_affinity, }; +/* + * Add offset and pnode information of the hub sourcing interrupts to the + * rb tree for a specific irq. + */ +static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +{ + struct rb_node **link = &uv_irq_root.rb_node; + struct rb_node *parent = NULL; + struct uv_irq_2_mmr_pnode *n; + struct uv_irq_2_mmr_pnode *e; + unsigned long irqflags; + + n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, + uv_blade_to_memory_nid(blade)); + if (!n) + return -ENOMEM; + + n->irq = irq; + n->offset = offset; + n->pnode = uv_blade_to_pnode(blade); + spin_lock_irqsave(&uv_irq_lock, irqflags); + /* Find the right place in the rbtree: */ + while (*link) { + parent = *link; + e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); + + if (unlikely(irq == e->irq)) { + /* irq entry exists */ + e->pnode = uv_blade_to_pnode(blade); + e->offset = offset; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + kfree(n); + return 0; + } + + if (irq < e->irq) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + /* Insert the node into the rbtree. 
*/ + rb_link_node(&n->list, parent, link); + rb_insert_color(&n->list, &uv_irq_root); + + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; +} + +/* Retrieve offset and pnode information from the rb tree for a specific irq */ +int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +{ + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + + if (e->irq == irq) { + *offset = e->offset; + *pnode = e->pnode; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; + } + + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return -1; +} + /* * Set up a mapping of an available irq and vector, and enable the specified * MMR that defines the MSI that is to be sent to the specified CPU when an * interrupt is raised. */ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset) + unsigned long mmr_offset, int restrict) { - int irq; - int ret; + int irq, ret; + + irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - irq = create_irq(); if (irq <= 0) return -EBUSY; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); - if (ret != irq) + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, + restrict); + if (ret == irq) + uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); + else destroy_irq(irq); return ret; @@ -71,9 +164,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); * * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). */ -void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +void uv_teardown_irq(unsigned int irq) { - arch_disable_uv_irq(mmr_blade, mmr_offset); + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + if (e->irq == irq) { + arch_disable_uv_irq(e->pnode, e->offset); + rb_erase(n, &uv_irq_root); + kfree(e); + break; + } + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); destroy_irq(irq); } EXPORT_SYMBOL_GPL(uv_teardown_irq); -- cgit v1.2.2 From 9338ad6ffb70eca97f335d93c54943828c8b209e Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 13 Oct 2009 15:32:36 -0500 Subject: x86, apic: Move SGI UV functionality out of generic IO-APIC code Move UV specific functionality out of the generic IO-APIC code. Signed-off-by: Dimitri Sivanich LKML-Reference: <20091013203236.GD20543@sgi.com> [ Cleaned up the code some more in their new places. 
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 29 ++++++-- arch/x86/include/asm/uv/uv_irq.h | 7 -- arch/x86/kernel/apic/io_apic.c | 140 ++------------------------------------- arch/x86/kernel/uv_irq.c | 123 ++++++++++++++++++++++++++++++++-- 4 files changed, 145 insertions(+), 154 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index ba180d93b08c..56f0877c9329 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -79,14 +79,31 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, int ioapic, int ioapic_pin, int trigger, int polarity) { - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; + irq_attr->ioapic = ioapic; + irq_attr->ioapic_pin = ioapic_pin; + irq_attr->trigger = trigger; + irq_attr->polarity = polarity; } -extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, - struct io_apic_irq_attr *irq_attr); +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ +struct irq_cfg { + struct irq_pin_list *irq_2_pin; + cpumask_var_t domain; + cpumask_var_t old_domain; + unsigned move_cleanup_count; + u8 vector; + u8 move_in_progress : 1; +}; + +extern struct irq_cfg *irq_cfg(unsigned int); +extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); +extern void send_cleanup_vector(struct irq_cfg *); +extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *); +extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); extern void enable_IO_APIC(void); diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h index 5397e1290952..d6b17c760622 100644 --- a/arch/x86/include/asm/uv/uv_irq.h +++ b/arch/x86/include/asm/uv/uv_irq.h @@ -31,13 +31,6 @@ enum { UV_AFFINITY_CPU }; -extern struct irq_chip uv_irq_chip; - -extern int -arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long, int); -extern void arch_disable_uv_irq(int, unsigned long); -extern int uv_set_irq_affinity(unsigned int, const struct cpumask *); - extern int uv_irq_2_mmr_info(int, unsigned long *, int *); extern int uv_setup_irq(char *, int, int, unsigned long, int); extern void uv_teardown_irq(unsigned int); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index bb52e7f6e953..ce16b65cfdcc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -60,8 +60,6 @@ #include #include #include -#include -#include #include @@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } -/* - * This is performance-critical, we want to do it O(1) - * - * Most irqs are mapped 1:1 with pins. - */ -struct irq_cfg { - struct irq_pin_list *irq_2_pin; - cpumask_var_t domain; - cpumask_var_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. 
*/ #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[] = { @@ -209,7 +193,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg = NULL; struct irq_desc *desc; @@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) /* end for move_irq_desc */ #else -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? irq_cfgx + irq : NULL; } @@ -1237,8 +1221,7 @@ next: return err; } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@ -2245,7 +2228,7 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) +void send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; @@ -2289,15 +2272,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - /* * Either sets desc->affinity to a valid value, and returns * ->cpu_mask_to_apicid of that, or returns BAD_APICID and * leaves desc->affinity untouched. */ -static unsigned int +unsigned int set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; @@ -3725,116 +3705,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -#ifdef CONFIG_X86_UV -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int restrict) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; - int mmr_pnode; - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - unsigned long flags; - int err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - cfg = irq_cfg(irq); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - if (restrict == UV_AFFINITY_CPU) - desc->status |= IRQ_NO_BALANCING; - else - desc->status |= IRQ_MOVE_PCNTXT; - - spin_lock_irqsave(&vector_lock, flags); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - spin_unlock_irqrestore(&vector_lock, flags); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. 
- */ -void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} - -int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; - unsigned int dest; - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - unsigned long mmr_offset; - unsigned mmr_pnode; - - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) - return -1; - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return 0; -} -#endif /* CONFIG_X86_64 */ - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 9a83775ab0f3..61d805df4c91 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -18,13 +18,16 @@ /* MMR offset and pnode of hub sourcing interrupts for a given irq */ struct uv_irq_2_mmr_pnode{ - struct rb_node list; - unsigned long offset; - int pnode; - int irq; + struct rb_node list; + unsigned long offset; + int pnode; + int irq; }; -static spinlock_t uv_irq_lock; -static struct rb_root uv_irq_root; + +static spinlock_t uv_irq_lock; +static struct rb_root uv_irq_root; + +static int uv_set_irq_affinity(unsigned int, const struct cpumask *); static void uv_noop(unsigned int irq) { @@ -131,6 +134,114 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) return -1; } +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. 
+ */ +static int +arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset, int restrict) +{ + const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int err; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); + if (err != 0) + return err; + + if (restrict == UV_AFFINITY_CPU) + desc->status |= IRQ_NO_BALANCING; + else + desc->status |= IRQ_MOVE_PCNTXT; + + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->mask = 1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} + +static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + unsigned int dest; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long mmr_offset; + unsigned mmr_pnode; + + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} + /* * Set up a mapping of an available irq and vector, and enable the specified * MMR that defines the MSI that is to be sent to the specified CPU when an -- cgit v1.2.2 From c44fc770845163f8d9e573f37f92a7b7a7ade14e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 19 Sep 2009 06:50:42 +0200 Subject: tracing: Move syscalls metadata handling from arch to core Most of the syscalls metadata processing is done from arch. But these operations are mostly generic across archs. Especially now that we have a common variable name that expresses the number of syscalls supported by an arch: NR_syscalls, the only remaining bit that needs to reside in arch is the syscall nr to addr translation.
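Roughly, each arch keeps only the nr-to-address hook and the core gains the generic loop. A hedged kernel-style sketch of the core side (the real version lands in the tracing core outside arch/x86, so details may differ; find_syscall_meta() is the name-matching helper being moved out of arch code):

	/* The one bit that stays per arch: */
	extern unsigned long __init arch_syscall_addr(int nr);

	static struct syscall_metadata **syscalls_metadata;

	static int __init init_ftrace_syscalls(void)
	{
		struct syscall_metadata *meta;
		unsigned long addr;
		int i;

		syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
						NR_syscalls, GFP_KERNEL);
		if (!syscalls_metadata)
			return -ENOMEM;

		for (i = 0; i < NR_syscalls; i++) {
			/* The arch translates the slot to an address ... */
			addr = arch_syscall_addr(i);
			/* ... and the core does the generic symbol matching. */
			meta = find_syscall_meta(addr);
			syscalls_metadata[i] = meta;
		}
		return 0;
	}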
v2: Compare syscalls symbols only after the "sys" prefix so that we avoid spurious mismatches with archs that have syscalls wrappers, in which case syscalls symbols have "SyS" prefixed aliases. (Reported by: Heiko Carstens) Signed-off-by: Frederic Weisbecker Acked-by: Heiko Carstens Cc: Ingo Molnar Cc: Steven Rostedt Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Paul Mundt --- arch/x86/kernel/ftrace.c | 76 ++---------------------------------------------- 1 file changed, 2 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 25e6f5fc4b1e..5a1b9758fd62 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -470,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; extern unsigned long *sys_call_table; -static struct syscall_metadata **syscalls_metadata; - -static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) -{ - struct syscall_metadata *start; - struct syscall_metadata *stop; - char str[KSYM_SYMBOL_LEN]; - - - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; - kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); - - for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name, str)) - return start; - } - return NULL; -} - -struct syscall_metadata *syscall_nr_to_meta(int nr) -{ - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) - return NULL; - - return syscalls_metadata[nr]; -} - -int syscall_name_to_nr(char *name) -{ - int i; - - if (!syscalls_metadata) - return -1; - - for (i = 0; i < NR_syscalls; i++) { - if (syscalls_metadata[i]) { - if (!strcmp(syscalls_metadata[i]->name, name)) - return i; - } - } - return -1; -} - -void set_syscall_enter_id(int num, int id) -{ - syscalls_metadata[num]->enter_id = id; -} - -void set_syscall_exit_id(int num, int id) +unsigned long __init arch_syscall_addr(int nr) { - syscalls_metadata[num]->exit_id = id; -} - -static int __init arch_init_ftrace_syscalls(void) -{ - int i; - struct syscall_metadata *meta; - unsigned long **psys_syscall_table = &sys_call_table; - - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return -ENOMEM; - } - - for (i = 0; i < NR_syscalls; i++) { - meta = find_syscall_meta(psys_syscall_table[i]); - syscalls_metadata[i] = meta; - } - return 0; + return (unsigned long)(&sys_call_table)[nr]; } -arch_initcall(arch_init_ftrace_syscalls); #endif -- cgit v1.2.2 From 7ec13187ef48b04bb7f6dfa266c7271a52d009c2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 Oct 2009 15:06:42 +0200 Subject: x86, apic: Fix prototype in hw_irq.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This warning: In file included from arch/x86/include/asm/ipi.h:23, from arch/x86/kernel/apic/apic_noop.c:27: arch/x86/include/asm/hw_irq.h:105: warning: ‘struct irq_desc’ declared inside parameter list arch/x86/include/asm/hw_irq.h:105: warning: its scope is only this definition or declaration, which is probably not what you want triggers because irq_desc is defined after hw_irq.h is included in irq.h. Since it's pointer reference only, a forward declaration of the type will solve the problem. 
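The fix pattern in isolation (a minimal sketch of the general rule): a prototype that only takes a pointer to a type needs just an incomplete type, so a one-line forward declaration removes the header-ordering dependency:

	/* No need to include the header that defines struct irq_desc here: */
	struct irq_desc;	/* forward declaration: size and layout not needed */

	/* Pointers to incomplete types are legal in prototypes: */
	extern unsigned int set_desc_affinity(struct irq_desc *desc,
					      const struct cpumask *mask);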
LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 56f0877c9329..1984ce9a13d2 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -102,6 +102,8 @@ struct irq_cfg { extern struct irq_cfg *irq_cfg(unsigned int); extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); + +struct irq_desc; extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *); extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); -- cgit v1.2.2 From ac06ea2cd06291e63951b51dd7c9a23e6a1f2683 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 10 Oct 2009 09:35:48 +0200 Subject: x86: Remove BKL from microcode cycle_kernel_lock() in microcode_open() is a worthless exercise as there is nothing to wait for. Remove it. Signed-off-by: Thomas Gleixner LKML-Reference: <20091010153349.196074920@linutronix.de> --- arch/x86/kernel/microcode_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 378e9a8f1bf8..2bcad3926edb 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -73,7 +73,6 @@ #include #include #include -#include #include #include #include @@ -201,7 +200,6 @@ static int do_microcode_update(const void __user *buf, size_t size) static int microcode_open(struct inode *unused1, struct file *unused2) { - cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } -- cgit v1.2.2 From 05d86412eab6a18cf57697474cc4f8fbfcd6936f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Oct 2009 19:02:20 +0200 Subject: x86: Remove BKL from apm_32 The lock/unlock kernel pair in do_open() got there with the BKL push down and protects nothing. Remove it. Replace the lock/unlock kernel in the ioctl code with a mutex to protect standbys_pending and suspends_pending.
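One subtlety worth spelling out, as an annotated excerpt of the suspend path from the diff below (abbreviated, not the complete ioctl handler): the mutex must be dropped before sleeping on the wait queue, otherwise a task blocked in suspend would hold the lock and stall every other APM ioctl:

	case APM_IOC_SUSPEND:
		mutex_lock(&apm_mutex);
		/* ... bookkeeping on suspends_read/suspends_pending ... */
		if (suspends_pending <= 0) {
			ret = suspend(1);
			mutex_unlock(&apm_mutex);	/* done: release */
		} else {
			as->suspend_wait = 1;
			mutex_unlock(&apm_mutex);	/* unlock BEFORE blocking */
			wait_event_interruptible(apm_suspend_waitqueue,
						 as->suspend_wait == 0);
			ret = as->suspend_result;
		}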
Signed-off-by: Thomas Gleixner LKML-Reference: <20091010153349.365236337@linutronix.de> --- arch/x86/kernel/apm_32.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 151ace69a5aa..b5b6b23bce53 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -204,7 +204,6 @@ #include #include -#include #include #include #include @@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); +static DEFINE_MUTEX(apm_mutex); /* * Set up a segment that references the real mode segment 0x40 @@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) return -EPERM; switch (cmd) { case APM_IOC_STANDBY: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->standbys_read > 0) { as->standbys_read--; as->standbys_pending--; @@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_STANDBY, as); if (standbys_pending <= 0) standby(); - unlock_kernel(); + mutex_unlock(&apm_mutex); break; case APM_IOC_SUSPEND: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->suspends_read > 0) { as->suspends_read--; as->suspends_pending--; @@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_SUSPEND, as); if (suspends_pending <= 0) { ret = suspend(1); + mutex_unlock(&apm_mutex); } else { as->suspend_wait = 1; + mutex_unlock(&apm_mutex); wait_event_interruptible(apm_suspend_waitqueue, as->suspend_wait == 0); ret = as->suspend_result; } - unlock_kernel(); return ret; default: return -ENOTTY; @@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp) { struct apm_user *as; - lock_kernel(); as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", sizeof(*as)); - unlock_kernel(); return -ENOMEM; } as->magic = APM_BIOS_MAGIC; @@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp) user_list = as; spin_unlock(&user_list_lock); filp->private_data = as; - unlock_kernel(); return 0; } -- cgit v1.2.2 From e47938b1faaf9e9041ae842a878901001ce20ea1 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 14 Oct 2009 09:16:30 -0500 Subject: x86: UV RTC: Fix early expiry handling Tune/fix early timer expiry handling and return correct early timeout value for set_next_event. Signed-off-by: Dimitri Sivanich LKML-Reference: <20091014141630.GB11048@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 583f11d5c480..ec14889628e0 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires) /* Initialize comparator value */ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); - return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); + if (uv_read_rtc(NULL) <= expires) + return 0; + + return !uv_intr_pending(pnode); } /* @@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) next_cpu = head->next_cpu; *t = expires; + /* Will this one be next to go off? 
*/ if (next_cpu < 0 || bcpu == next_cpu || expires < head->cpu[next_cpu].expires) { @@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) *t = ULLONG_MAX; uv_rtc_find_next_timer(head, pnode); spin_unlock_irqrestore(&head->lock, flags); - return 1; + return -ETIME; } } @@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) * * Returns 1 if this timer was pending. */ -static int uv_rtc_unset_timer(int cpu) +static int uv_rtc_unset_timer(int cpu, int force) { int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); @@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu) spin_lock_irqsave(&head->lock, flags); - if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) + if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) rc = 1; - *t = ULLONG_MAX; - - /* Was the hardware setup for this timer? */ - if (head->next_cpu == bcpu) - uv_rtc_find_next_timer(head, pnode); + if (rc) { + *t = ULLONG_MAX; + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + } spin_unlock_irqrestore(&head->lock, flags); @@ -310,20 +315,20 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu); + uv_rtc_unset_timer(ced_cpu, 1); break; } } static void uv_rtc_interrupt(void) { - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); int cpu = smp_processor_id(); + struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); if (!ced || !ced->event_handler) return; - if (uv_rtc_unset_timer(cpu) != 1) + if (uv_rtc_unset_timer(cpu, 0) != 1) return; ced->event_handler(ced); -- cgit v1.2.2 From 8c28de4d011f37b2893ecfcec9a985c0e9bd786f Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 14 Oct 2009 09:18:48 -0500 Subject: x86: UV RTC: Add clocksource only boot option Add clocksource only boot option for UV RTC. 
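With the option split, "uvrtc" on the kernel command line enables only the clocksource registration, and "uvrtcevt" must be given as well to also set up the per-cpu clockevents, e.g.:

	linux ... uvrtc uvrtcevt

Both strings are wired up via __setup() handlers in the patch below.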
Signed-off-by: Dimitri Sivanich LKML-Reference: <20091014141848.GC11048@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index ec14889628e0..c6324ad7c0d9 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -75,6 +75,7 @@ struct uv_rtc_timer_head { static struct uv_rtc_timer_head **blade_info __read_mostly; static int uv_rtc_enable; +static int uv_rtc_evt_enable; /* * Hardware interface routines @@ -342,6 +343,14 @@ static int __init uv_enable_rtc(char *str) } __setup("uvrtc", uv_enable_rtc); +static int __init uv_enable_evt_rtc(char *str) +{ + uv_rtc_evt_enable = 1; + + return 1; +} +__setup("uvrtcevt", uv_enable_evt_rtc); + static __init void uv_rtc_register_clockevents(struct work_struct *dummy) { struct clock_event_device *ced = &__get_cpu_var(cpu_ced); @@ -358,16 +367,20 @@ static __init int uv_rtc_setup_clock(void) if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) return -ENODEV; - generic_interrupt_extension = uv_rtc_interrupt; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, clocksource_uv.shift); rc = clocksource_register(&clocksource_uv); - if (rc) { - generic_interrupt_extension = NULL; + if (rc) + printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); + else + printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", + sn_rtc_cycles_per_second/(unsigned long)1E6); + + if (rc || !uv_rtc_evt_enable) return rc; - } + + generic_interrupt_extension = uv_rtc_interrupt; /* Setup and register clockevents */ rc = uv_rtc_allocate_timers(); -- cgit v1.2.2 From d5991ff297ad2f7e2698eefcd8269df5ecec150f Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 14 Oct 2009 09:21:03 -0500 Subject: x86: UV RTC: Clean up error handling Cleanup error handling in uv_rtc_setup_clock. 
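The rework converts uv_rtc_setup_clock() to the kernel's usual goto-based unwind, so every late failure funnels through a single exit that unregisters the clocksource. In outline (a sketch of the control flow in the patch below):

	rc = uv_rtc_allocate_timers();
	if (rc)
		goto error;
	/* ... register clockevents ... */
	return 0;
error:
	clocksource_unregister(&clocksource_uv);
	return rc;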
Signed-off-by: Dimitri Sivanich LKML-Reference: <20091014142103.GD11048@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index c6324ad7c0d9..255645084534 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -380,15 +380,12 @@ static __init int uv_rtc_setup_clock(void) if (rc || !uv_rtc_evt_enable) return rc; - generic_interrupt_extension = uv_rtc_interrupt; - /* Setup and register clockevents */ rc = uv_rtc_allocate_timers(); - if (rc) { - clocksource_unregister(&clocksource_uv); - generic_interrupt_extension = NULL; - return rc; - } + if (rc) + goto error; + + generic_interrupt_extension = uv_rtc_interrupt; clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, NSEC_PER_SEC, clock_event_device_uv.shift); @@ -401,11 +398,19 @@ static __init int uv_rtc_setup_clock(void) rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { - clocksource_unregister(&clocksource_uv); generic_interrupt_extension = NULL; uv_rtc_deallocate_timers(); + goto error; } + printk(KERN_INFO "UV RTC clockevents registered\n"); + + return 0; + +error: + clocksource_unregister(&clocksource_uv); + printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); + return rc; } arch_initcall(uv_rtc_setup_clock); -- cgit v1.2.2 From 4a4de9c7d7111ce4caf422b856756125d8304f9d Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 14 Oct 2009 09:22:57 -0500 Subject: x86: UV RTC: Rename generic_interrupt to x86_platform_ipi Signed-off-by: Dimitri Sivanich LKML-Reference: <20091014142257.GE11048@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/include/asm/hardirq.h | 2 +- arch/x86/include/asm/hw_irq.h | 4 ++-- arch/x86/include/asm/irq.h | 2 +- arch/x86/include/asm/irq_vectors.h | 2 +- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/irq.c | 20 ++++++++++---------- arch/x86/kernel/irqinit.c | 4 ++-- arch/x86/kernel/uv_time.c | 10 +++++----- 9 files changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index f5693c81a1db..8e8ec663a98f 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -34,7 +34,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, smp_invalidate_interrupt) #endif -BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) /* * every pentium local APIC has two 'local interrupts', with a diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 82e3e8f01043..beaabd794a10 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -12,7 +12,7 @@ typedef struct { unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; #endif - unsigned int generic_irqs; /* arch dependent */ + unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; unsigned int apic_pending_irqs; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index ba180d93b08c..95207ca5c6f1 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -27,7 +27,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); -extern void generic_interrupt(void); +extern void x86_platform_ipi(void); 
extern void error_interrupt(void); extern void perf_pending_interrupt(void); @@ -101,7 +101,7 @@ extern void eisa_set_level_irq(unsigned int irq); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); -extern void smp_generic_interrupt(struct pt_regs *); +extern void smp_x86_platform_ipi(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); #ifdef CONFIG_X86_IO_APIC extern asmlinkage void smp_irq_move_cleanup_interrupt(void); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index ddda6cbed6f4..fcbc6d144501 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -36,7 +36,7 @@ static inline int irq_canonicalize(int irq) extern void fixup_irqs(void); #endif -extern void (*generic_interrupt_extension)(void); +extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 5b21f0ec3df2..6a635bd39867 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -106,7 +106,7 @@ /* * Generic system vector for platform specific use */ -#define GENERIC_INTERRUPT_VECTOR 0xed +#define X86_PLATFORM_IPI_VECTOR 0xed /* * Performance monitoring pending work vector: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f8f358..6714432ef381 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -969,8 +969,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt GENERIC_INTERRUPT_VECTOR \ - generic_interrupt smp_generic_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 391206199515..9375dce39f5f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -18,7 +18,7 @@ atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ -void (*generic_interrupt_extension)(void) = NULL; +void (*x86_platform_ipi_callback)(void) = NULL; /* * 'what should we do if we get a hw irq event on an illegal vector'. @@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); #endif - if (generic_interrupt_extension) { + if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_printf(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP @@ -187,8 +187,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_pending_irqs; #endif - if (generic_interrupt_extension) - sum += irq_stats(cpu)->generic_irqs; + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -252,9 +252,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } /* - * Handler for GENERIC_INTERRUPT_VECTOR. + * Handler for X86_PLATFORM_IPI_VECTOR. 
*/ -void smp_generic_interrupt(struct pt_regs *regs) +void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -264,10 +264,10 @@ void smp_generic_interrupt(struct pt_regs *regs) irq_enter(); - inc_irq_stat(generic_irqs); + inc_irq_stat(x86_platform_ipis); - if (generic_interrupt_extension) - generic_interrupt_extension(); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); run_local_timers(); irq_exit(); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 40f30773fb29..d5932226614f 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -200,8 +200,8 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI for X86 platform specific use */ + alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 255645084534..3da7b1d8bfd3 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -91,7 +91,7 @@ static void uv_rtc_send_IPI(int cpu) pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | - (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } @@ -116,7 +116,7 @@ static int uv_setup_intr(int cpu, u64 expires) uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, UVH_EVENT_OCCURRED0_RTC1_MASK); - val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); /* Set configuration */ @@ -364,7 +364,7 @@ static __init int uv_rtc_setup_clock(void) { int rc; - if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + if (!uv_rtc_enable || !is_uv_system() || x86_platform_ipi_callback) return -ENODEV; clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, @@ -385,7 +385,7 @@ static __init int uv_rtc_setup_clock(void) if (rc) goto error; - generic_interrupt_extension = uv_rtc_interrupt; + x86_platform_ipi_callback = uv_rtc_interrupt; clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, NSEC_PER_SEC, clock_event_device_uv.shift); @@ -398,7 +398,7 @@ static __init int uv_rtc_setup_clock(void) rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { - generic_interrupt_extension = NULL; + x86_platform_ipi_callback = NULL; uv_rtc_deallocate_timers(); goto error; } -- cgit v1.2.2 From f88f2b4fdb1e098433ad2b005b6f7353f7268ce1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 15 Oct 2009 19:04:16 +0400 Subject: x86: apic: Allow noop operations to be called almost at any time Since only the apic noop driver is in use, allow callers to invoke almost any operation (any of those the noop driver supports, of course). Initially Ingo Molnar reported that apic noop issues a warning for the pkg id (which is actually a false positive and should be eliminated). So we keep the check (and the warning) only for the read/write operations, while all other ops may be used freely. Also: - fix noop_cpu_to_logical_apicid, it should return 0.
- rename noop_default_phys_pkg_id to noop_phys_pkg_id (we use default_ prefix for more general routines in apic subsystem). Reported-by: Ingo Molnar Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Maciej W. Rozycki LKML-Reference: <20091015150416.GC5331@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + arch/x86/kernel/apic/apic_noop.c | 105 +++++++++++++++++++++------------------ 2 files changed, 59 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 61a5628810da..dce93d4b0eaf 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -246,6 +246,7 @@ static int modern_apic(void) */ void apic_disable(void) { + pr_info("APIC: switched to apic NOOP\n"); apic = &apic_noop; } diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 0b93ec2fde0a..9ab6ffb313ac 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -6,7 +6,7 @@ * * Though in case if apic is disabled (for some reason) we try * to not uglify the caller's code and allow to call (some) apic routines - * like self-ipi, etc... and issue a warning if an operation is not allowed + * like self-ipi, etc... */ #include @@ -30,76 +30,88 @@ #include #include -/* - * some operations should never be reached with - * noop apic if it's not turned off, this mostly - * means the caller forgot to disable apic (or - * check the apic presence) before doing a call - */ -static void warn_apic_enabled(void) +static void noop_init_apic_ldr(void) { } +static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_allbutself(int vector) { } +static void noop_send_IPI_all(int vector) { } +static void noop_send_IPI_self(int vector) { } +static void noop_apic_wait_icr_idle(void) { } +static void noop_apic_icr_write(u32 low, u32 id) { } + +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { - WARN_ONCE((cpu_has_apic || !disable_apic), - "APIC: Called for NOOP operation with apic enabled\n"); + return -1; } -/* - * To check operations but do not bloat source code - */ -#define NOOP_FUNC(func) func { warn_apic_enabled(); } -#define NOOP_FUNC_RET(func, ret) func { warn_apic_enabled(); return ret; } - -NOOP_FUNC(static void noop_init_apic_ldr(void)) -NOOP_FUNC(static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector)) -NOOP_FUNC(static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)) -NOOP_FUNC(static void noop_send_IPI_allbutself(int vector)) -NOOP_FUNC(static void noop_send_IPI_all(int vector)) -NOOP_FUNC(static void noop_send_IPI_self(int vector)) -NOOP_FUNC_RET(static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip), -1) -NOOP_FUNC(static void noop_apic_write(u32 reg, u32 v)) -NOOP_FUNC(void noop_apic_wait_icr_idle(void)) -NOOP_FUNC_RET(static u32 noop_safe_apic_wait_icr_idle(void), 0) -NOOP_FUNC_RET(static u64 noop_apic_icr_read(void), 0) -NOOP_FUNC(static void noop_apic_icr_write(u32 low, u32 id)) -NOOP_FUNC_RET(static physid_mask_t noop_ioapic_phys_id_map(physid_mask_t phys_map), phys_map) -NOOP_FUNC_RET(static int noop_cpu_to_logical_apicid(int cpu), 1) -NOOP_FUNC_RET(static int noop_default_phys_pkg_id(int cpuid_apic, int index_msb), 0) -NOOP_FUNC_RET(static unsigned int noop_get_apic_id(unsigned long x), 0) +static u32 noop_safe_apic_wait_icr_idle(void) +{ 
+ return 0; +} + +static u64 noop_apic_icr_read(void) +{ + return 0; +} + +static physid_mask_t noop_ioapic_phys_id_map(physid_mask_t phys_map) +{ + return phys_map; +} + +static int noop_cpu_to_logical_apicid(int cpu) +{ + return 0; +} + +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return 0; +} + +static unsigned int noop_get_apic_id(unsigned long x) +{ + return 0; +} static int noop_probe(void) { - /* should not ever be enabled this way */ + /* + * NOOP apic should not ever be + * enabled via probe routine + */ return 0; } static int noop_apic_id_registered(void) { - warn_apic_enabled(); - return physid_isset(read_apic_id(), phys_cpu_present_map); + /* + * if we would be really "pedantic" + * we should pass read_apic_id() here + * but since NOOP suppose APIC ID = 0 + * lets save a few cycles + */ + return physid_isset(0, phys_cpu_present_map); } static const struct cpumask *noop_target_cpus(void) { - warn_apic_enabled(); - /* only BSP here */ return cpumask_of(0); } static unsigned long noop_check_apicid_used(physid_mask_t bitmap, int apicid) { - warn_apic_enabled(); return physid_isset(apicid, bitmap); } static unsigned long noop_check_apicid_present(int bit) { - warn_apic_enabled(); return physid_isset(bit, phys_cpu_present_map); } static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) { - warn_apic_enabled(); if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); cpumask_clear(retmask); @@ -108,22 +120,21 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) int noop_apicid_to_node(int logical_apicid) { - warn_apic_enabled(); - /* we're always on node 0 */ return 0; } static u32 noop_apic_read(u32 reg) { - /* - * noop-read is always safe until we have - * non-disabled unit - */ WARN_ON_ONCE((cpu_has_apic && !disable_apic)); return 0; } +static void noop_apic_write(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + struct apic apic_noop = { .name = "noop", .probe = noop_probe, @@ -157,7 +168,7 @@ struct apic apic_noop = { .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, - .phys_pkg_id = noop_default_phys_pkg_id, + .phys_pkg_id = noop_phys_pkg_id, .mps_oem_check = NULL, -- cgit v1.2.2 From 5e09954a9acc3b435ffe318b95afd3c02fae069f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 16 Oct 2009 12:31:32 +0200 Subject: x86, mce: Fix up MCE naming nomenclature Prefix global/setup routines with "mcheck_" thus differentiating from the internal facilities prefixed with "mce_". Also, prefix the per cpu calls with mcheck_cpu and rename them to reflect the MCE setup hierarchy of calls better. There should be no functionality change resulting from this patch. 
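After the rename, the per-cpu setup hierarchy can be read straight off the names; roughly (a sketch of the call structure, exact ordering in the patch below):

	mcheck_cpu_init(c)
		__mcheck_cpu_ancient_init(c)
		__mcheck_cpu_cap_init()
		__mcheck_cpu_apply_quirks(c)
		__mcheck_cpu_init_generic()
		__mcheck_cpu_init_vendor(c)
		__mcheck_cpu_init_timer()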
Signed-off-by: Borislav Petkov Cc: Andi Kleen LKML-Reference: <1255689093-26921-1-git-send-email-borislav.petkov@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 ++-- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 52 ++++++++++++++++++++-------------------- 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 227a72df6441..161485da6838 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -120,9 +120,9 @@ extern int mce_disabled; extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE -void mcheck_init(struct cpuinfo_x86 *c); +void mcheck_cpu_init(struct cpuinfo_x86 *c); #else -static inline void mcheck_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} #endif #ifdef CONFIG_X86_ANCIENT_MCE diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..4df69a38be57 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -839,7 +839,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ - mcheck_init(c); + mcheck_cpu_init(c); #endif select_idle_routine(c); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0fb9dc50697e..68d968e69b13 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1136,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_timer(unsigned long data) +static void mce_start_timer(unsigned long data) { struct timer_list *t = &per_cpu(mce_timer, data); int *n; @@ -1220,7 +1220,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU. 
*/ -static int __cpuinit mce_cap_init(void) +static int __cpuinit __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1258,7 +1258,7 @@ static int __cpuinit mce_cap_init(void) return 0; } -static void mce_init(void) +static void __mcheck_cpu_init_generic(void) { mce_banks_t all_banks; u64 cap; @@ -1287,7 +1287,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); @@ -1355,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) return 0; } -static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return; @@ -1369,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) } } -static void mce_cpu_features(struct cpuinfo_x86 *c) +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -1383,7 +1383,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } -static void mce_init_timer(void) +static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(mce_next_interval); @@ -1394,7 +1394,7 @@ static void mce_init_timer(void) *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mcheck_timer, smp_processor_id()); + setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } @@ -1414,26 +1414,26 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mce_disabled) return; - mce_ancient_init(c); + __mcheck_cpu_ancient_init(c); if (!mce_available(c)) return; - if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { mce_disabled = 1; return; } machine_check_vector = do_machine_check; - mce_init(); - mce_cpu_features(c); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); if (raw_smp_processor_id() == 0) @@ -1665,7 +1665,7 @@ __setup("mce", mcheck_enable); * Disable machine checks on suspend and shutdown. We can't really handle * them later. 
*/ -static int mce_disable(void) +static int mce_disable_error_reporting(void) { int i; @@ -1680,12 +1680,12 @@ static int mce_disable(void) static int mce_suspend(struct sys_device *dev, pm_message_t state) { - return mce_disable(); + return mce_disable_error_reporting(); } static int mce_shutdown(struct sys_device *dev) { - return mce_disable(); + return mce_disable_error_reporting(); } /* @@ -1695,8 +1695,8 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(); - mce_cpu_features(¤t_cpu_data); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(¤t_cpu_data); return 0; } @@ -1706,8 +1706,8 @@ static void mce_cpu_restart(void *data) del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(¤t_cpu_data)) return; - mce_init(); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_timer(); } /* Reinit MCEs after user configuration changes */ @@ -1733,7 +1733,7 @@ static void mce_enable_ce(void *all) cmci_reenable(); cmci_recheck(); if (all) - mce_init_timer(); + __mcheck_cpu_init_timer(); } static struct sysdev_class mce_sysclass = { @@ -2042,7 +2042,7 @@ static __init void mce_init_banks(void) } } -static __init int mce_init_device(void) +static __init int mcheck_init_device(void) { int err; int i = 0; @@ -2070,7 +2070,7 @@ static __init int mce_init_device(void) return err; } -device_initcall(mce_init_device); +device_initcall(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. @@ -2118,7 +2118,7 @@ static int fake_panic_set(void *data, u64 val) DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, "%llu\n"); -static int __init mce_debugfs_init(void) +static int __init mcheck_debugfs_init(void) { struct dentry *dmce, *ffake_panic; @@ -2132,5 +2132,5 @@ static int __init mce_debugfs_init(void) return 0; } -late_initcall(mce_debugfs_init); +late_initcall(mcheck_debugfs_init); #endif -- cgit v1.2.2 From b33a6363649f0ff83ec81597ea7fe7e688f973cb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 16 Oct 2009 12:31:33 +0200 Subject: x86, mce: Add a global MCE init helper Add an early initcall (pre SMP) which sets up global MCE functionality. Signed-off-by: Borislav Petkov Cc: Andi Kleen LKML-Reference: <1255689093-26921-2-git-send-email-borislav.petkov@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 68d968e69b13..80801705edd7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1436,8 +1436,6 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); - if (raw_smp_processor_id() == 0) - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); } /* @@ -1657,6 +1655,14 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); +static int __init mcheck_init(void) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + + return 0; +} +early_initcall(mcheck_init); + /* * Sysfs support */ -- cgit v1.2.2 From 8c95bc3e206cff7a55edd2fc5f0e2b305d57903f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 16 Oct 2009 20:07:36 -0400 Subject: x86: Add MMX/SSE opcode groups to opcode map Add missing MMX/SSE opcode groups to x86 opcode map. 
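For readers of the map format: group entries are keyed by the ModRM.reg field, so a line such as

	2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B)

says that reg=2 decodes as the MMX form of psrlw when ModRM.mod is 11B, and as the SSE (xmm) form when the 0x66 prefix is also present. At build time these tables are translated into the decoder's attribute arrays (inat-tables.c).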
Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <20091017000736.16556.29061.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 78a0daf12e15..e7285d8379e3 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -777,12 +777,22 @@ GrpTable: Grp11 EndTable GrpTable: Grp12 +2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B) +4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B) +6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B) EndTable GrpTable: Grp13 +2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B) +4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B) +6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B) EndTable GrpTable: Grp14 +2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B) +3: psrldq Udq,Ib (66),(11B) +6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B) +7: pslldq Udq,Ib (66),(11B) EndTable GrpTable: Grp15 -- cgit v1.2.2 From d1baf5a5a6088e2991b7dbbd370ff200bd6615ce Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 16 Oct 2009 20:07:44 -0400 Subject: x86: Add AMD prefetch and 3DNow! opcodes to opcode map Add the AMD prefetch and 3DNow! opcodes, including FEMMS. Since 3DNow! uses the last immediate byte as an opcode extension byte, the x86 insn decoder just treats the extension byte as an immediate byte instead of as part of the opcode (insn_get_opcode() decodes the first "0x0f 0x0f" bytes.) Users interested in analyzing 3DNow! opcodes can still decode them by analyzing the immediate byte. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <20091017000744.16556.27881.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index e7285d8379e3..894497f77808 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -306,9 +306,10 @@ Referrer: 2-byte escape 0a: 0b: UD2 (1B) 0c: -0d: NOP Ev -0e: -0f: +0d: NOP Ev | GrpP +0e: FEMMS +# 3DNow! uses the last imm byte as opcode extension. +0f: 3DNow! Pq,Qq,Ib # 0x0f 0x10-0x1f 10: movups Vps,Wps | movss Vss,Wss (F3) | movupd Vpd,Wpd (66) | movsd Vsd,Wsd (F2) 11: movups Wps,Vps | movss Wss,Vss (F3) | movupd Wpd,Vpd (66) | movsd Wsd,Vsd (F2) @@ -813,6 +814,12 @@ GrpTable: Grp16 3: prefetch T2 EndTable +# AMD's Prefetch Group +GrpTable: GrpP +0: PREFETCH +1: PREFETCHW +EndTable + GrpTable: GrpPDLK 0: MONTMUL 1: XSHA1 -- cgit v1.2.2 From 0e1227d356e9b2fe0500d6cc7084f752040a1e0e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 19 Oct 2009 11:53:06 +0900 Subject: crypto: ghash - Add PCLMULQDQ accelerated implementation PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, carry-less multiplication. More information about PCLMULQDQ can be found at: http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ Because PCLMULQDQ changes the XMM state, its usage must be enclosed in kernel_fpu_begin()/kernel_fpu_end(), which can be used only in process context; therefore the acceleration is implemented as a crypto_ahash.
That is, requests in soft IRQ context will be deferred to the cryptd kernel thread. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/ghash-clmulni-intel_asm.S | 157 ++++++++++++++ arch/x86/crypto/ghash-clmulni-intel_glue.c | 333 +++++++++++++++++++++++++++++ arch/x86/include/asm/cpufeature.h | 1 + 4 files changed, 494 insertions(+) create mode 100644 arch/x86/crypto/ghash-clmulni-intel_asm.S create mode 100644 arch/x86/crypto/ghash-clmulni-intel_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index cfb0010fa940..1a58ad89fdf7 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o +obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o @@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o + +ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S new file mode 100644 index 000000000000..b9e787a511da --- /dev/null +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -0,0 +1,157 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains accelerated part of ghash + * implementation. More information about PCLMULQDQ can be found at: + * + * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying + * Vinodh Gopal + * Erdinc Ozturk + * Deniz Karakoyunlu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation.
+ */ + +#include + +.align 16 +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f +.Lpoly: + .octa 0xc2000000000000000000000000000001 +.Ltwo_one: + .octa 0x00000001000000000000000000000001 + +#define DATA %xmm0 +#define SHASH %xmm1 +#define T1 %xmm2 +#define T2 %xmm3 +#define T3 %xmm4 +#define BSWAP %xmm5 +#define IN1 %xmm6 + +.text + +/* + * __clmul_gf128mul_ble: internal ABI + * input: + * DATA: operand1 + * SHASH: operand2, hash_key << 1 mod poly + * output: + * DATA: operand1 * operand2 mod poly + * changed: + * T1 + * T2 + * T3 + */ +__clmul_gf128mul_ble: + movaps DATA, T1 + pshufd $0b01001110, DATA, T2 + pshufd $0b01001110, SHASH, T3 + pxor DATA, T2 + pxor SHASH, T3 + + # pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0 + .byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00 + # pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1 + .byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11 + # pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0) + .byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00 + pxor DATA, T2 + pxor T1, T2 # T2 = a0 * b1 + a1 * b0 + + movaps T2, T3 + pslldq $8, T3 + psrldq $8, T2 + pxor T3, DATA + pxor T2, T1 # is result of + # carry-less multiplication + + # first phase of the reduction + movaps DATA, T3 + psllq $1, T3 + pxor DATA, T3 + psllq $5, T3 + pxor DATA, T3 + psllq $57, T3 + movaps T3, T2 + pslldq $8, T2 + psrldq $8, T3 + pxor T2, DATA + pxor T3, T1 + + # second phase of the reduction + movaps DATA, T2 + psrlq $5, T2 + pxor DATA, T2 + psrlq $1, T2 + pxor DATA, T2 + psrlq $1, T2 + pxor T2, T1 + pxor T1, DATA + ret + +/* void clmul_ghash_mul(char *dst, const be128 *shash) */ +ENTRY(clmul_ghash_mul) + movups (%rdi), DATA + movups (%rsi), SHASH + movaps .Lbswap_mask, BSWAP + pshufb BSWAP, DATA + call __clmul_gf128mul_ble + pshufb BSWAP, DATA + movups DATA, (%rdi) + ret + +/* + * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + * const be128 *shash); + */ +ENTRY(clmul_ghash_update) + cmp $16, %rdx + jb .Lupdate_just_ret # check length + movaps .Lbswap_mask, BSWAP + movups (%rdi), DATA + movups (%rcx), SHASH + pshufb BSWAP, DATA +.align 4 +.Lupdate_loop: + movups (%rsi), IN1 + pshufb BSWAP, IN1 + pxor IN1, DATA + call __clmul_gf128mul_ble + sub $16, %rdx + add $16, %rsi + cmp $16, %rdx + jge .Lupdate_loop + pshufb BSWAP, DATA + movups DATA, (%rdi) +.Lupdate_just_ret: + ret + +/* + * void clmul_ghash_setkey(be128 *shash, const u8 *key); + * + * Calculate hash_key << 1 mod poly + */ +ENTRY(clmul_ghash_setkey) + movaps .Lbswap_mask, BSWAP + movups (%rsi), %xmm0 + pshufb BSWAP, %xmm0 + movaps %xmm0, %xmm1 + psllq $1, %xmm0 + psrlq $63, %xmm1 + movaps %xmm1, %xmm2 + pslldq $8, %xmm1 + psrldq $8, %xmm2 + por %xmm1, %xmm0 + # reduction + pshufd $0b00100100, %xmm2, %xmm1 + pcmpeqd .Ltwo_one, %xmm1 + pand .Lpoly, %xmm1 + pxor %xmm1, %xmm0 + movups %xmm0, (%rdi) + ret diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c new file mode 100644 index 000000000000..65d409644d72 --- /dev/null +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -0,0 +1,333 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains glue code. + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +void clmul_ghash_mul(char *dst, const be128 *shash); + +void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + const be128 *shash); + +void clmul_ghash_setkey(be128 *shash, const u8 *key); + +struct ghash_async_ctx { + struct cryptd_ahash *cryptd_tfm; +}; + +struct ghash_ctx { + be128 shash; +}; + +struct ghash_desc_ctx { + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + clmul_ghash_setkey(&ctx->shash, key); + + return 0; +} + +static int ghash_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *dst = dctx->buffer; + + kernel_fpu_begin(); + if (dctx->bytes) { + int n = min(srclen, dctx->bytes); + u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + clmul_ghash_mul(dst, &ctx->shash); + } + + clmul_ghash_update(dst, src, srclen, &ctx->shash); + kernel_fpu_end(); + + if (srclen & 0xf) { + src += srclen - (srclen & 0xf); + srclen &= 0xf; + dctx->bytes = GHASH_BLOCK_SIZE - srclen; + while (srclen--) + *dst++ ^= *src++; + } + + return 0; +} + +static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +{ + u8 *dst = dctx->buffer; + + if (dctx->bytes) { + u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); + + while (dctx->bytes--) + *tmp++ ^= 0; + + kernel_fpu_begin(); + clmul_ghash_mul(dst, &ctx->shash); + kernel_fpu_end(); + } + + dctx->bytes = 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *buf = dctx->buffer; + + ghash_flush(ctx, dctx); + memcpy(dst, buf, GHASH_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "__ghash", + .cra_driver_name = "__ghash-pclmulqdqni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list), + }, +}; + +static int ghash_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (irq_fpu_usable()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_init(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = 
req->base.flags; + return crypto_shash_init(desc); + } +} + +static int ghash_async_update(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (irq_fpu_usable()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_update(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return shash_ahash_update(req, desc); + } +} + +static int ghash_async_final(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (irq_fpu_usable()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_final(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return crypto_shash_final(desc, req->result); + } +} + +static int ghash_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (irq_fpu_usable()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_digest(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return shash_ahash_digest(req, desc); + } +} + +static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct crypto_ahash *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ahash_setkey(child, key, keylen); + crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) + & CRYPTO_TFM_RES_MASK); + + return 0; +} + +static int ghash_async_init_tfm(struct crypto_tfm *tfm) +{ + struct cryptd_ahash *cryptd_tfm; + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ctx->cryptd_tfm = cryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&cryptd_tfm->base)); + + return 0; +} + +static void ghash_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ahash(ctx->cryptd_tfm); +} + +static struct ahash_alg ghash_async_alg = { + .init = ghash_async_init, + .update = ghash_async_update, + .final = ghash_async_final, + .setkey = ghash_async_setkey, + .digest = ghash_async_digest, + .halg = { + .digestsize = GHASH_DIGEST_SIZE, + .base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-clmulni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = 
LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list), + .cra_init = ghash_async_init_tfm, + .cra_exit = ghash_async_exit_tfm, + }, + }, +}; + +static int __init ghash_pclmulqdqni_mod_init(void) +{ + int err; + + if (!cpu_has_pclmulqdq) { + printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not" + " detected.\n"); + return -ENODEV; + } + + err = crypto_register_shash(&ghash_alg); + if (err) + goto err_out; + err = crypto_register_ahash(&ghash_async_alg); + if (err) + goto err_shash; + + return 0; + +err_shash: + crypto_unregister_shash(&ghash_alg); +err_out: + return err; +} + +static void __exit ghash_pclmulqdqni_mod_exit(void) +{ + crypto_unregister_ahash(&ghash_async_alg); + crypto_unregister_shash(&ghash_alg); +} + +module_init(ghash_pclmulqdqni_mod_init); +module_exit(ghash_pclmulqdqni_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " + "acclerated by PCLMULQDQ-NI"); +MODULE_ALIAS("ghash"); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9cfc88b97742..613700f27a4a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -248,6 +248,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) +#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 -- cgit v1.2.2 From b9af7c0d44b8bb71e3af5e94688d076414aa8c87 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 14 Oct 2009 14:46:55 -0700 Subject: x86-64: preserve large page mapping for 1st 2MB kernel txt with CONFIG_DEBUG_RODATA In the first 2MB, kernel text is co-located with kernel static page tables setup by head_64.S. CONFIG_DEBUG_RODATA chops this 2MB large page mapping to small 4KB pages as we mark the kernel text as RO, leaving the static page tables as RW. With CONFIG_DEBUG_RODATA disabled, OLTP run on NHM-EP shows 1% improvement with 2% reduction in system time and 1% improvement in iowait idle time. To recover this, move the kernel static page tables to .data section, so that we don't have to break the first 2MB of kernel text to small pages with CONFIG_DEBUG_RODATA. Signed-off-by: Suresh Siddha LKML-Reference: <20091014220254.063193621@sbs-t61.sc.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/head_64.S | 3 ++- arch/x86/mm/init_64.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 780cd928fcd5..b55ee4ff509f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -262,11 +262,11 @@ ENTRY(secondary_startup_64) .quad x86_64_start_kernel ENTRY(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - __FINITDATA ENTRY(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 + __FINITDATA bad_address: jmp bad_address @@ -340,6 +340,7 @@ ENTRY(name) i = i + 1 ; \ .endr + .data /* * This default setting generates an ident mapping at address 0x100000 * and a mapping for the kernel that precisely maps virtual address diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c20d30b440de..7dafd4159ad6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -699,7 +699,7 @@ static int kernel_set_to_readonly; void set_kernel_text_rw(void) { - unsigned long start = PFN_ALIGN(_stext); + unsigned long start = PFN_ALIGN(_text); unsigned long end = PFN_ALIGN(__start_rodata); if (!kernel_set_to_readonly) @@ -713,7 +713,7 @@ void set_kernel_text_rw(void) void set_kernel_text_ro(void) { - unsigned long start = PFN_ALIGN(_stext); + unsigned long start = PFN_ALIGN(_text); unsigned long end = PFN_ALIGN(__start_rodata); if (!kernel_set_to_readonly) @@ -727,7 +727,7 @@ void set_kernel_text_ro(void) void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long start = PFN_ALIGN(_text), end = PFN_ALIGN(__end_rodata); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; -- cgit v1.2.2 From 74e081797bd9d2a7d8005fe519e719df343a2ba8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 14 Oct 2009 14:46:56 -0700 Subject: x86-64: align RODATA kernel section to 2MB with CONFIG_DEBUG_RODATA CONFIG_DEBUG_RODATA chops the large pages spanning boundaries of kernel text/rodata/data to small 4KB pages as they are mapped with different attributes (text as RO, RODATA as RO and NX etc). On x86_64, preserve the large page mappings for kernel text/rodata/data boundaries when CONFIG_DEBUG_RODATA is enabled. This is done by allowing the RODATA section to be hugepage aligned and having same RWX attributes for the 2MB page boundaries Extra Memory pages padding the sections will be freed during the end of the boot and the kernel identity mappings will have different RWX permissions compared to the kernel text mappings. Kernel identity mappings to these physical pages will be mapped with smaller pages but large page mappings are still retained for kernel text,rodata,data mappings. Signed-off-by: Suresh Siddha LKML-Reference: <20091014220254.190119924@sbs-t61.sc.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/sections.h | 6 ++++++ arch/x86/kernel/vmlinux.lds.S | 17 +++++++++++++++++ arch/x86/mm/init_64.c | 14 +++++++++++++- arch/x86/mm/pageattr.c | 14 ++++++++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 1b7ee5d673c2..0a5242428659 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -2,7 +2,13 @@ #define _ASM_X86_SECTIONS_H #include +#include extern char __brk_base[], __brk_limit[]; +extern struct exception_table_entry __stop___ex_table[]; + +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +extern char __end_rodata_hpage_align[]; +#endif #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9fa..14763790e415 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,6 +41,21 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) + +#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); + +#define X64_ALIGN_DEBUG_RODATA_END \ + . = ALIGN(HPAGE_SIZE); \ + __end_rodata_hpage_align = .; + +#else + +#define X64_ALIGN_DEBUG_RODATA_BEGIN +#define X64_ALIGN_DEBUG_RODATA_END + +#endif + PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -90,7 +105,9 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 + X64_ALIGN_DEBUG_RODATA_BEGIN RO_DATA(PAGE_SIZE) + X64_ALIGN_DEBUG_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7dafd4159ad6..0ed09fad6aa1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -727,9 +727,13 @@ void set_kernel_text_ro(void) void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(_text), end = PFN_ALIGN(__end_rodata); + unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); + unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); + unsigned long data_start = (unsigned long) &_sdata; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -752,6 +756,14 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: again\n"); set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif + + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(text_end)), + (unsigned long) + page_address(virt_to_page(rodata_start))); + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(rodata_end)), + (unsigned long) page_address(virt_to_page(data_start))); } #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dd38bfbefd1f..b494fc4a986e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -279,6 +279,20 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) + /* + * Kernel text mappings for the large page aligned .rodata section + * will be read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything. 
+ * + * This will preserve the large page mappings for kernel text/data + * at no extra cost. + */ + if (within(address, (unsigned long)_text, + (unsigned long)__end_rodata_hpage_align)) + pgprot_val(forbidden) |= _PAGE_RW; +#endif + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); return prot; -- cgit v1.2.2 From d6cc1c3af760c1d3f6b42f6e52b08718a6207cf1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Oct 2009 06:12:04 -0700 Subject: x86-64: add comment for RODATA large page retainment Add a comment explaining why RODATA is aligned to 2 MB. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 14763790e415..fd2dabec1dff 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -42,7 +42,18 @@ jiffies_64 = jiffies; #endif #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) - +/* + * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA + * we retain large page mappings for boundaries spanning kernel text, rodata + * and data sections. + * + * However, kernel identity mappings will have different RWX permissions + * to the pages mapping to text and to the pages padding (which are freed) the + * text section. Hence kernel identity mappings will be broken to smaller + * pages. For 64-bit, kernel text and kernel identity mappings are different, + * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, + * as well as retain 2MB large page mappings for kernel text. + */ #define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); #define X64_ALIGN_DEBUG_RODATA_END \ -- cgit v1.2.2 From 06ed6ba5ecb771cc3a967838a4bb1d9cbd8786b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 20 Oct 2009 12:55:24 -0400 Subject: x86: Fix group attribute decoding bug Fix a typo in inat_get_group_attribute(): it should refer to inat_group_tables, not inat_escape_tables. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Jim Keniston Cc: Frederic Weisbecker LKML-Reference: <20091020165524.4145.97333.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/inat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index 054656a01dfd..3fb5998b823e 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -68,7 +68,7 @@ insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx, if (!table) return inat_group_common_attribute(grp_attr); if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) { - table = inat_escape_tables[n][m]; + table = inat_group_tables[n][m]; if (!table) return inat_group_common_attribute(grp_attr); } -- cgit v1.2.2 From 9983d60d74db9e544c6cb6f65351849fe8e9c1de Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 20 Oct 2009 12:55:31 -0400 Subject: x86: Add AES opcodes to opcode map Add Intel AES opcodes to the x86 opcode map. These opcodes are used in arch/x86/crypto/aesni-intel_asm.S.
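With the new db-df rows the decoder can step over AES-NI sequences such as (illustrative only):

	aesenc		%xmm1, %xmm0	/* one AES round */
	aesenclast	%xmm1, %xmm0	/* final AES round */
	aesimc		%xmm1, %xmm0	/* InvMixColumns for the decryption key schedule */

all of which are 0x66-prefixed 0x0f 0x38 opcodes; aeskeygenassist lands in the 0x0f 0x3a table below.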
Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Jim Keniston Cc: Frederic Weisbecker LKML-Reference: <20091020165531.4145.21872.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 894497f77808..701c4678687b 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -567,7 +567,7 @@ fe: paddd Pq,Qq | paddd Vdq,Wdq (66) ff: EndTable -Table: 3-byte opcode 1 +Table: 3-byte opcode 1 (0x0f 0x38) Referrer: 3-byte escape 1 # 0x0f 0x38 0x00-0x0f 00: pshufb Pq,Qq | pshufb Vdq,Wdq (66) @@ -642,11 +642,16 @@ Referrer: 3-byte escape 1 41: phminposuw Vdq,Wdq (66) 80: INVEPT Gd/q,Mdq (66) 81: INVPID Gd/q,Mdq (66) +db: aesimc Vdq,Wdq (66) +dc: aesenc Vdq,Wdq (66) +dd: aesenclast Vdq,Wdq (66) +de: aesdec Vdq,Wdq (66) +df: aesdeclast Vdq,Wdq (66) f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) EndTable -Table: 3-byte opcode 2 +Table: 3-byte opcode 2 (0x0f 0x3a) Referrer: 3-byte escape 2 # 0x0f 0x3a 0x00-0xff 08: roundps Vdq,Wdq,Ib (66) @@ -671,6 +676,7 @@ Referrer: 3-byte escape 2 61: pcmpestri Vdq,Wdq,Ib (66) 62: pcmpistrm Vdq,Wdq,Ib (66) 63: pcmpistri Vdq,Wdq,Ib (66) +df: aeskeygenassist Vdq,Wdq,Ib (66) EndTable GrpTable: Grp1 -- cgit v1.2.2 From 9bf4e7fba8006d19846fec877b6da0616b2772de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 21 Oct 2009 14:39:51 +0200 Subject: x86, instruction decoder: Fix test_get_len build rules Add the kernel source include directory to the include search path as well, to fix this build bug: In file included from arch/x86/tools/test_get_len.c:28: arch/x86/lib/insn.c:21:26: error: linux/string.h: No such file or directory Cc: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Jim Keniston Cc: Frederic Weisbecker LKML-Reference: <20091020165531.4145.21872.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/tools/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 1bd006c81564..5e295d95dc25 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -8,8 +8,8 @@ posttest: $(obj)/test_get_len vmlinux hostprogs-y := test_get_len # -I needed for generated C source and C source which in the kernel tree. -HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ +HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ -# Dependancies are also needed. +# Dependencies are also needed. $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c -- cgit v1.2.2 From b1258ac2963d42ee7e807d2993d15e3dd39ff4b0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 22 Oct 2009 11:27:22 +0900 Subject: x86: Remove pfn in add_one_highpage_init() Commit cc9f7a0ccf000d4db5fbdc7b0ae48eefea102f69 changed add_one_highpage_init. We don't use pfn any more. Let's remove the unnecessary argument. This patch doesn't change function behavior. This patch is based on v2.6.32-rc5.
Signed-off-by: Minchan Kim Cc: Yinghai Lu LKML-Reference: <20091022112722.adc8e55c.minchan.kim@barrios-desktop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5e32b07b535d..f64d0d5e0f89 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __init add_one_highpage_init(struct page *page, int pfn) +static void __init add_one_highpage_init(struct page *page) { ClearPageReserved(page); init_page_count(page); @@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn, if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn); + add_one_highpage_init(page); } return 0; -- cgit v1.2.2 From 4868402d9582bfb00a5f0157ae5d7ffd2d539fb0 Mon Sep 17 00:00:00 2001 From: Alexander Potashev Date: Sat, 24 Oct 2009 03:37:23 +0400 Subject: x86, boot: Simplify setting of the PAE bit A single 'movl' is shorter than the 'xorl'-'orl' pair. No change in behaviour. Signed-off-by: Alexander Potashev LKML-Reference: <1256341043-4928-1-git-send-email-aspotashev@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 077e1b69198e..faff0dc9c06a 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -107,8 +107,7 @@ ENTRY(startup_32) lgdt gdt(%ebp) /* Enable PAE mode */ - xorl %eax, %eax - orl $(X86_CR4_PAE), %eax + movl $(X86_CR4_PAE), %eax movl %eax, %cr4 /* -- cgit v1.2.2 From 6f9b41006af1bc489030f84ee247abc0df1edccd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 27 Oct 2009 11:01:38 +0100 Subject: x86, apic: Clear APIC Timer Initial Count Register on shutdown Commit a98f8fd24fb24fcb9a359553e64dd6aac5cf4279 (x86: apic reset counter on shutdown) set the counter to max to avoid spurious interrupts when the timer is re-enabled. (In theory) you'll still get a spurious interrupt if you spend more than 344 seconds with this interrupt disabled and then unmask it. The right thing to do is to clear the register. This prevents the interrupt from happening at all (at least it does on AMD hardware).
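For reference, a back-of-the-envelope sketch of where the 344 second figure comes from (the 12.5 MHz timer clock below is an assumed example rate; the real rate is the bus clock divided by the programmed divider):

	u64 ticks = 0xffffffffULL;	/* TMICT programmed to max      */
	u32 rate = 12500000;		/* assumed 12.5 MHz timer clock */
	u32 secs = ticks / rate;	/* ~343s until the count reaches
					 * 0 and the unmasked interrupt
					 * fires; writing 0 to TMICT
					 * never starts the countdown  */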
Signed-off-by: Andreas Herrmann LKML-Reference: <20091027100138.GB30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dce93d4b0eaf..4c689f45b238 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -444,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0xffffffff); + apic_write(APIC_TMICT, 0); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ -- cgit v1.2.2 From 883242dd0e5faaba041528a9a99f483f2a656c83 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Oct 2009 13:15:11 -0400 Subject: tracing: allow to change permissions for text with dynamic ftrace enabled Commit 74e081797bd9d2a7d8005fe519e719df343a2ba8 (x86-64: align RODATA kernel section to 2MB with CONFIG_DEBUG_RODATA) prevents text sections from becoming read/write using set_memory_rw. Dynamic ftrace changes all text pages to read/write just before converting the tracing calls to nops, and vice versa. I originally just added a flag to allow this transition when ftrace did the change, but I also found that the CPA testing, when running, would remove the read/write as well; since ftrace does not do the text conversion on boot up, the CPA changes caused the dynamic tracer to fail its self tests. The current solution is to simply not prevent change_page_attr from setting the RW bit for kernel text pages. Reported-by: Ingo Molnar Cc: Suresh Siddha Cc: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/mm/pageattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index b494fc4a986e..78d3168b3c64 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -279,7 +279,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && \ + !defined(CONFIG_DYNAMIC_FTRACE) /* * Kernel text mappings for the large page aligned .rodata section * will be read-only. For the kernel identity mappings covering -- cgit v1.2.2 From 7f387d3f2421781610588faa2f49ae5f1737b137 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Oct 2009 16:42:04 -0400 Subject: x86: Fix SSE opcode map bug Fix superscript positions: some SSE opcode superscripts were not put in the correct position. Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch.
Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju LKML-Reference: <20091027204204.30545.97296.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 701c4678687b..efef3cada84e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -401,9 +401,9 @@ Referrer: 2-byte escape 62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66) 63: packsswb Pq,Qq | packsswb Vdq,Wdq (66) 64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66) -65: pcmpgtw Pq,Qq | pcmpgtw(66) Vdq,Wdq +65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66) 66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66) -67: packuswb Pq,Qq | packuswb(66) Vdq,Wdq +67: packuswb Pq,Qq | packuswb Vdq,Wdq (66) 68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66) 69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66) 6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66) @@ -425,8 +425,8 @@ Referrer: 2-byte escape 79: VMWRITE Gd/q,Ed/q 7a: 7b: -7c: haddps(F2) Vps,Wps | haddpd(66) Vpd,Wpd -7d: hsubps(F2) Vps,Wps | hsubpd(66) Vpd,Wpd +7c: haddps Vps,Wps (F2) | haddpd Vpd,Wpd (66) +7d: hsubps Vps,Wps (F2) | hsubpd Vpd,Wpd (66) 7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66) | movq Vq,Wq (F3) 7f: movq Qq,Pq | movdqa Wdq,Vdq (66) | movdqu Wdq,Vdq (F3) # 0x0f 0x80-0x8f @@ -574,7 +574,7 @@ Referrer: 3-byte escape 1 01: phaddw Pq,Qq | phaddw Vdq,Wdq (66) 02: phaddd Pq,Qq | phaddd Vdq,Wdq (66) 03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66) -04: pmaddubsw Pq,Qq | pmaddubsw (66)Vdq,Wdq +04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66) 05: phsubw Pq,Qq | phsubw Vdq,Wdq (66) 06: phsubd Pq,Qq | phsubd Vdq,Wdq (66) 07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66) -- cgit v1.2.2 From 04d46c1b13b02e1e5c24eb270a01cf3f94ee4d04 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Oct 2009 16:42:11 -0400 Subject: x86: Merge INAT_REXPFX into INAT_PFX_* Merge INAT_REXPFX into INAT_PFX_* macro and rename it to INAT_PFX_REX. Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju LKML-Reference: <20091027204211.30545.58090.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/inat.h | 36 ++++++++++++++++++++---------------- arch/x86/lib/insn.c | 2 +- arch/x86/tools/gen-insn-attr-x86.awk | 6 +++--- 3 files changed, 24 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 2866fddd1848..c2487d2aca25 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -30,10 +30,11 @@ #define INAT_OPCODE_TABLE_SIZE 256 #define INAT_GROUP_TABLE_SIZE 8 -/* Legacy instruction prefixes */ +/* Legacy last prefixes */ #define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ #define INAT_PFX_REPNE 2 /* 0xF2 */ /* LPFX2 */ #define INAT_PFX_REPE 3 /* 0xF3 */ /* LPFX3 */ +/* Other Legacy prefixes */ #define INAT_PFX_LOCK 4 /* 0xF0 */ #define INAT_PFX_CS 5 /* 0x2E */ #define INAT_PFX_DS 6 /* 0x3E */ @@ -42,8 +43,11 @@ #define INAT_PFX_GS 9 /* 0x65 */ #define INAT_PFX_SS 10 /* 0x36 */ #define INAT_PFX_ADDRSZ 11 /* 0x67 */ +/* x86-64 REX prefix */ +#define INAT_PFX_REX 12 /* 0x4X */ -#define INAT_LPREFIX_MAX 3 +#define INAT_LSTPFX_MAX 3 +#define INAT_LGCPFX_MAX 11 /* Immediate size */ #define INAT_IMM_BYTE 1 @@ -75,12 +79,11 @@ #define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS) /* Flags */ #define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) -#define INAT_REXPFX (1 << INAT_FLAG_OFFS) -#define INAT_MODRM (1 << (INAT_FLAG_OFFS + 1)) -#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 2)) -#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 3)) -#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 4)) -#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_MODRM (1 << (INAT_FLAG_OFFS)) +#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1)) +#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2)) +#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) +#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -97,9 +100,10 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_attr_t esc_attr); /* Attribute checking functions */ -static inline int inat_is_prefix(insn_attr_t attr) +static inline int inat_is_legacy_prefix(insn_attr_t attr) { - return attr & INAT_PFX_MASK; + attr &= INAT_PFX_MASK; + return attr && attr <= INAT_LGCPFX_MAX; } static inline int inat_is_address_size_prefix(insn_attr_t attr) @@ -112,9 +116,14 @@ static inline int inat_is_operand_size_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ; } +static inline int inat_is_rex_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX; +} + static inline int inat_last_prefix_id(insn_attr_t attr) { - if ((attr & INAT_PFX_MASK) > INAT_LPREFIX_MAX) + if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) return 0; else return attr & INAT_PFX_MASK; @@ -155,11 +164,6 @@ static inline int inat_immediate_size(insn_attr_t attr) return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS; } -static inline int inat_is_rex_prefix(insn_attr_t attr) -{ - return attr & INAT_REXPFX; -} - static inline int inat_has_modrm(insn_attr_t attr) { return attr & INAT_MODRM; diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index dfd56a30053f..9f483179a8a6 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -69,7 +69,7 @@ void insn_get_prefixes(struct insn *insn) lb = 
0; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); - while (inat_is_prefix(attr)) { + while (inat_is_legacy_prefix(attr)) { /* Skip if same prefix */ for (i = 0; i < nb; i++) if (prefixes->bytes[i] == b) diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 19ba096b7dd1..7d5492951e22 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -278,7 +278,7 @@ function convert_operands(opnd, i,imm,mod) # check REX prefix if (match(opcode, rex_expr)) - flags = add_flags(flags, "INAT_REXPFX") + flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)") # check coprocessor escape : TODO if (match(opcode, fpu_expr)) @@ -316,7 +316,7 @@ END { # print escape opcode map's array print "/* Escape opcode map array */" print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ - "[INAT_LPREFIX_MAX + 1] = {" + "[INAT_LSTPFX_MAX + 1] = {" for (i = 0; i < geid; i++) for (j = 0; j < max_lprefix; j++) if (etable[i,j]) @@ -325,7 +325,7 @@ END { # print group opcode map's array print "/* Group opcode map array */" print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ - "[INAT_LPREFIX_MAX + 1] = {" + "[INAT_LSTPFX_MAX + 1] = {" for (i = 0; i < ggid; i++) for (j = 0; j < max_lprefix; j++) if (gtable[i,j]) -- cgit v1.2.2 From 82cb57028c864822c5a260f806d051e2ce28c86a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Oct 2009 16:42:19 -0400 Subject: x86: Add pclmulq to x86 opcode map Add pclmulq opcode to x86 opcode map. Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju LKML-Reference: <20091027204219.30545.82039.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index efef3cada84e..1f41246e6e3c 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -672,6 +672,7 @@ Referrer: 3-byte escape 2 40: dpps Vdq,Wdq,Ib (66) 41: dppd Vdq,Wdq,Ib (66) 42: mpsadbw Vdq,Wdq,Ib (66) +44: pclmulq Vdq,Wdq,Ib (66) 60: pcmpestrm Vdq,Wdq,Ib (66) 61: pcmpestri Vdq,Wdq,Ib (66) 62: pcmpistrm Vdq,Wdq,Ib (66) -- cgit v1.2.2 From e0e492e99b372c6990a5daca9e4683c341f1330e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Oct 2009 16:42:27 -0400 Subject: x86: AVX instruction set decoder support Add Intel AVX(Advanced Vector Extensions) instruction set support to x86 instruction decoder. This adds insn.vex_prefix field for storing VEX prefixes, and introduces some original tags for expressing opcodes attributes. Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju LKML-Reference: <20091027204226.30545.23451.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/inat.h | 32 ++- arch/x86/include/asm/insn.h | 43 +++- arch/x86/lib/inat.c | 12 + arch/x86/lib/insn.c | 52 +++++ arch/x86/lib/x86-opcode-map.txt | 431 ++++++++++++++++++----------------- arch/x86/tools/gen-insn-attr-x86.awk | 94 ++++++-- 6 files changed, 431 insertions(+), 233 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index c2487d2aca25..205b063e3e32 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -32,8 +32,8 @@ /* Legacy last prefixes */ #define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ -#define INAT_PFX_REPNE 2 /* 0xF2 */ /* LPFX2 */ -#define INAT_PFX_REPE 3 /* 0xF3 */ /* LPFX3 */ +#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */ +#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */ /* Other Legacy prefixes */ #define INAT_PFX_LOCK 4 /* 0xF0 */ #define INAT_PFX_CS 5 /* 0x2E */ @@ -45,6 +45,9 @@ #define INAT_PFX_ADDRSZ 11 /* 0x67 */ /* x86-64 REX prefix */ #define INAT_PFX_REX 12 /* 0x4X */ +/* AVX VEX prefixes */ +#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ +#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -84,6 +87,8 @@ #define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2)) #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) +#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -98,6 +103,9 @@ extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx, insn_attr_t esc_attr); +extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, + insn_byte_t vex_m, + insn_byte_t vex_pp); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -129,6 +137,17 @@ static inline int inat_last_prefix_id(insn_attr_t attr) return attr & INAT_PFX_MASK; } +static inline int inat_is_vex_prefix(insn_attr_t attr) +{ + attr &= INAT_PFX_MASK; + return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3; +} + +static inline int inat_is_vex3_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -189,4 +208,13 @@ static inline int inat_has_variant(insn_attr_t attr) return attr & INAT_VARIANT; } +static inline int inat_accept_vex(insn_attr_t attr) +{ + return attr & INAT_VEXOK; +} + +static inline int inat_must_vex(insn_attr_t attr) +{ + return attr & INAT_VEXONLY; +} #endif diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 12b4e3751d3f..96c2e0ad04ca 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -39,6 +39,7 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ + struct insn_field vex_prefix; /* VEX prefix */ struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -80,6 +81,19 @@ struct insn { #define X86_REX_X(rex) ((rex) & 2) #define X86_REX_B(rex) ((rex) & 1) +/* VEX bit flags */ +#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ +#define 
X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ +#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ +#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ +#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ +/* VEX bit fields */ +#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ +#define X86_VEX2_M 1 /* VEX2.M always 1 */ +#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ +#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ +#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ + /* The last prefix is needed for two-byte and three-byte opcodes */ static inline insn_byte_t insn_last_prefix(struct insn *insn) { @@ -114,15 +128,42 @@ static inline void kernel_insn_init(struct insn *insn, const void *kaddr) #endif } +static inline int insn_is_avx(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return (insn->vex_prefix.value != 0); +} + +static inline insn_byte_t insn_vex_m_bits(struct insn *insn) +{ + if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ + return X86_VEX2_M; + else + return X86_VEX3_M(insn->vex_prefix.bytes[1]); +} + +static inline insn_byte_t insn_vex_p_bits(struct insn *insn) +{ + if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ + return X86_VEX_P(insn->vex_prefix.bytes[1]); + else + return X86_VEX_P(insn->vex_prefix.bytes[2]); +} + /* Offset of each field from kaddr */ static inline int insn_offset_rex_prefix(struct insn *insn) { return insn->prefixes.nbytes; } -static inline int insn_offset_opcode(struct insn *insn) +static inline int insn_offset_vex_prefix(struct insn *insn) { return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; } +static inline int insn_offset_opcode(struct insn *insn) +{ + return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; +} static inline int insn_offset_modrm(struct insn *insn) { return insn_offset_opcode(insn) + insn->opcode.nbytes; diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index 3fb5998b823e..46fc4ee09fc4 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -76,3 +76,15 @@ insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx, inat_group_common_attribute(grp_attr); } +insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, + insn_byte_t vex_p) +{ + const insn_attr_t *table; + if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) + return 0; + table = inat_avx_tables[vex_m][vex_p]; + if (!table) + return 0; + return table[opcode]; +} + diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 9f483179a8a6..9f33b984d0ef 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -28,6 +28,9 @@ #define peek_next(t, insn) \ ({t r; r = *(t*)insn->next_byte; r; }) +#define peek_nbyte_next(t, insn, n) \ + ({t r; r = *(t*)((insn)->next_byte + n); r; }) + /** * insn_init() - initialize struct insn * @insn: &struct insn to be initialized @@ -107,6 +110,7 @@ found: insn->prefixes.bytes[3] = lb; } + /* Decode REX prefix */ if (insn->x86_64) { b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); @@ -120,6 +124,39 @@ found: } } insn->rex_prefix.got = 1; + + /* Decode VEX prefix */ + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_vex_prefix(attr)) { + insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); + if (!insn->x86_64) { + /* + * In 32-bits mode, if the [7:6] bits (mod bits of + * ModRM) on the second byte are not 11b, it is + * LDS or LES. 
+ */ + if (X86_MODRM_MOD(b2) != 3) + goto vex_end; + } + insn->vex_prefix.bytes[0] = b; + insn->vex_prefix.bytes[1] = b2; + if (inat_is_vex3_prefix(attr)) { + b2 = peek_nbyte_next(insn_byte_t, insn, 2); + insn->vex_prefix.bytes[2] = b2; + insn->vex_prefix.nbytes = 3; + insn->next_byte += 3; + if (insn->x86_64 && X86_VEX_W(b2)) + /* VEX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } else { + insn->vex_prefix.nbytes = 2; + insn->next_byte += 2; + } + } +vex_end: + insn->vex_prefix.got = 1; + prefixes->got = 1; return; } @@ -147,6 +184,18 @@ void insn_get_opcode(struct insn *insn) op = get_next(insn_byte_t, insn); opcode->bytes[0] = op; opcode->nbytes = 1; + + /* Check if there is VEX prefix or not */ + if (insn_is_avx(insn)) { + insn_byte_t m, p; + m = insn_vex_m_bits(insn); + p = insn_vex_p_bits(insn); + insn->attr = inat_get_avx_attribute(op, m, p); + if (!inat_accept_vex(insn->attr)) + insn->attr = 0; /* This instruction is bad */ + goto end; /* VEX has only 1 byte for opcode */ + } + insn->attr = inat_get_opcode_attribute(op); while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ @@ -155,6 +204,9 @@ void insn_get_opcode(struct insn *insn) pfx = insn_last_prefix(insn); insn->attr = inat_get_escape_attribute(op, pfx, insn->attr); } + if (inat_must_vex(insn->attr)) + insn->attr = 0; /* This instruction is bad */ +end: opcode->got = 1; } diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 1f41246e6e3c..9887bfeeb2db 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -3,6 +3,7 @@ # # Table: table-name # Referrer: escaped-name +# AVXcode: avx-code # opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] # (or) # opcode: escape # escaped-name @@ -13,9 +14,16 @@ # reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] # EndTable # +# AVX Superscripts +# (VEX): this opcode can accept VEX prefix. +# (oVEX): this opcode requires VEX prefix. +# (o128): this opcode only supports 128bit VEX. +# (o256): this opcode only supports 256bit VEX. +# Table: one byte opcode Referrer: +AVXcode: # 0x00 - 0x0f 00: ADD Eb,Gb 01: ADD Ev,Gv @@ -225,8 +233,8 @@ c0: Grp2 Eb,Ib (1A) c1: Grp2 Ev,Ib (1A) c2: RETN Iw (f64) c3: RETN -c4: LES Gz,Mp (i64) -c5: LDS Gz,Mp (i64) +c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix) +c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix) c6: Grp11 Eb,Ib (1A) c7: Grp11 Ev,Iz (1A) c8: ENTER Iw,Ib @@ -290,8 +298,9 @@ fe: Grp4 (1A) ff: Grp5 (1A) EndTable -Table: 2-byte opcode # First Byte is 0x0f +Table: 2-byte opcode (0x0f) Referrer: 2-byte escape +AVXcode: 1 # 0x0f 0x00-0x0f 00: Grp6 (1A) 01: Grp7 (1A) @@ -311,14 +320,14 @@ Referrer: 2-byte escape # 3DNow! uses the last imm byte as opcode extension. 0f: 3DNow! 
Pq,Qq,Ib # 0x0f 0x10-0x1f -10: movups Vps,Wps | movss Vss,Wss (F3) | movupd Vpd,Wpd (66) | movsd Vsd,Wsd (F2) -11: movups Wps,Vps | movss Wss,Vss (F3) | movupd Wpd,Vpd (66) | movsd Wsd,Vsd (F2) -12: movlps Vq,Mq | movlpd Vq,Mq (66) | movhlps Vq,Uq | movddup Vq,Wq (F2) | movsldup Vq,Wq (F3) -13: mpvlps Mq,Vq | movlpd Mq,Vq (66) -14: unpcklps Vps,Wq | unpcklpd Vpd,Wq (66) -15: unpckhps Vps,Wq | unpckhpd Vpd,Wq (66) -16: movhps Vq,Mq | movhpd Vq,Mq (66) | movlsps Vq,Uq | movshdup Vq,Wq (F3) -17: movhps Mq,Vq | movhpd Mq,Vq (66) +10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128) +11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128) +12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX) +13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128) +14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX) +15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX) +16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX) +17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128) 18: Grp16 (1A) 19: 1a: @@ -336,14 +345,14 @@ Referrer: 2-byte escape 25: 26: 27: -28: movaps Vps,Wps | movapd Vpd,Wpd (66) -29: movaps Wps,Vps | movapd Wpd,Vpd (66) -2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2) -2b: movntps Mps,Vps | movntpd Mpd,Vpd (66) -2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2) -2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2) -2e: ucomiss Vss,Wss | ucomisd Vsd,Wsd (66) -2f: comiss Vss,Wss | comisd Vsd,Wsd (66) +28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX) +29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX) +2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128) +2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX) +2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128) +2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128) +2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128) +2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128) # 0x0f 0x30-0x3f 30: WRMSR 31: RDTSC @@ -379,56 +388,56 @@ Referrer: 2-byte escape 4e: CMOVLE/NG Gv,Ev 4f: CMOVNLE/G Gv,Ev # 0x0f 0x50-0x5f -50: movmskps Gd/q,Ups | movmskpd Gd/q,Upd (66) -51: sqrtps Vps,Wps | sqrtss Vss,Wss (F3) | sqrtpd Vpd,Wpd (66) | sqrtsd Vsd,Wsd (F2) -52: rsqrtps Vps,Wps | rsqrtss Vss,Wss (F3) -53: rcpps Vps,Wps | rcpss Vss,Wss (F3) -54: andps Vps,Wps | andpd Vpd,Wpd (66) -55: andnps Vps,Wps | andnpd Vpd,Wpd (66) -56: orps Vps,Wps | orpd Vpd,Wpd (66) -57: xorps Vps,Wps | xorpd Vpd,Wpd (66) -58: addps Vps,Wps | addss Vss,Wss (F3) | addpd Vpd,Wpd (66) | addsd Vsd,Wsd (F2) -59: mulps Vps,Wps | mulss Vss,Wss (F3) | mulpd Vpd,Wpd (66) | mulsd Vsd,Wsd (F2) -5a: cvtps2pd Vpd,Wps | cvtss2sd Vsd,Wss (F3) | cvtpd2ps Vps,Wpd (66) | cvtsd2ss Vsd,Wsd (F2) -5b: cvtdq2ps Vps,Wdq | cvtps2dq Vdq,Wps (66) | cvttps2dq Vdq,Wps (F3) -5c: subps Vps,Wps | subss Vss,Wss (F3) | subpd Vpd,Wpd (66) | subsd Vsd,Wsd (F2) -5d: minps Vps,Wps | minss Vss,Wss (F3) | minpd Vpd,Wpd (66) | minsd Vsd,Wsd (F2) 
-5e: divps Vps,Wps | divss Vss,Wss (F3) | divpd Vpd,Wpd (66) | divsd Vsd,Wsd (F2) -5f: maxps Vps,Wps | maxss Vss,Wss (F3) | maxpd Vpd,Wpd (66) | maxsd Vsd,Wsd (F2) +50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX) +51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128) +52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128) +53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128) +54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX) +55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX) +56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX) +57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX) +58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128) +59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128) +5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128) +5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX) +5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128) +5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128) +5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128) +5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128) # 0x0f 0x60-0x6f -60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66) -61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66) -62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66) -63: packsswb Pq,Qq | packsswb Vdq,Wdq (66) -64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66) -65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66) -66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66) -67: packuswb Pq,Qq | packuswb Vdq,Wdq (66) -68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66) -69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66) -6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66) -6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66) -6c: punpcklqdq Vdq,Wdq (66) -6d: punpckhqdq Vdq,Wdq (66) -6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66) -6f: movq Pq,Qq | movdqa Vdq,Wdq (66) | movdqu Vdq,Wdq (F3) +60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128) +61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128) +62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128) +63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128) +64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128) +65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128) +66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128) +67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128) +68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128) +69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128) +6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128) +6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128) +6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128) +6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128) +6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128) +6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX) # 0x0f 0x70-0x7f -70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66) | pshufhw Vdq,Wdq,Ib (F3) | pshuflw VdqWdq,Ib (F2) +70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128) 71: Grp12 (1A) 72: Grp13 (1A) 73: Grp14 (1A) -74: pcmpeqb Pq,Qq | 
pcmpeqb Vdq,Wdq (66) -75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66) -76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66) -77: emms +74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128) +75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128) +76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128) +77: emms/vzeroupper/vzeroall (VEX) 78: VMREAD Ed/q,Gd/q 79: VMWRITE Gd/q,Ed/q 7a: 7b: -7c: haddps Vps,Wps (F2) | haddpd Vpd,Wpd (66) -7d: hsubps Vps,Wps (F2) | hsubpd Vpd,Wpd (66) -7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66) | movq Vq,Wq (F3) -7f: movq Qq,Pq | movdqa Wdq,Vdq (66) | movdqu Wdq,Vdq (F3) +7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX) +7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX) +7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128) +7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX) # 0x0f 0x80-0x8f 80: JO Jz (f64) 81: JNO Jz (f64) @@ -500,11 +509,11 @@ bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf c0: XADD Eb,Gb c1: XADD Ev,Gv -c2: cmpps Vps,Wps,Ib | cmpss Vss,Wss,Ib (F3) | cmppd Vpd,Wpd,Ib (66) | cmpsd Vsd,Wsd,Ib (F2) +c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX) c3: movnti Md/q,Gd/q -c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66) -c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66) -c6: shufps Vps,Wps,Ib | shufpd Vpd,Wpd,Ib (66) +c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128) +c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128) +c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX) c7: Grp9 (1A) c8: BSWAP RAX/EAX/R8/R8D c9: BSWAP RCX/ECX/R9/R9D @@ -515,77 +524,78 @@ cd: BSWAP RBP/EBP/R13/R13D ce: BSWAP RSI/ESI/R14/R14D cf: BSWAP RDI/EDI/R15/R15D # 0x0f 0xd0-0xdf -d0: addsubps Vps,Wps (F2) | addsubpd Vpd,Wpd (66) -d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66) -d2: psrld Pq,Qq | psrld Vdq,Wdq (66) -d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66) -d4: paddq Pq,Qq | paddq Vdq,Wdq (66) -d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66) -d6: movq Wq,Vq (66) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) -d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66) -d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66) -d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66) -da: pminub Pq,Qq | pminub Vdq,Wdq (66) -db: pand Pq,Qq | pand Vdq,Wdq (66) -dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66) -dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66) -de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66) -df: pandn Pq,Qq | pandn Vdq,Wdq (66) +d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX) +d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128) +d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128) +d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128) +d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128) +d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128) +d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128) +d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128) +d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128) +da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128) +db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128) +dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128) +dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128) +de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128) +df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128) # 0x0f 0xe0-0xef -e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66) -e1: psraw Pq,Qq | psraw Vdq,Wdq (66) -e2: psrad Pq,Qq | psrad Vdq,Wdq (66) -e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66) -e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66) -e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66) 
-e6: cvtpd2dq Vdq,Wpd (F2) | cvttpd2dq Vdq,Wpd (66) | cvtdq2pd Vpd,Wdq (F3) -e7: movntq Mq,Pq | movntdq Mdq,Vdq (66) -e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66) -e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66) -ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66) -eb: por Pq,Qq | por Vdq,Wdq (66) -ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66) -ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66) -ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66) -ef: pxor Pq,Qq | pxor Vdq,Wdq (66) +e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128) +e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128) +e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128) +e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128) +e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128) +e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128) +e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX) +e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX) +e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128) +e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128) +ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128) +eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128) +ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128) +ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128) +ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128) +ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128) # 0x0f 0xf0-0xff -f0: lddqu Vdq,Mdq (F2) -f1: psllw Pq,Qq | psllw Vdq,Wdq (66) -f2: pslld Pq,Qq | pslld Vdq,Wdq (66) -f3: psllq Pq,Qq | psllq Vdq,Wdq (66) -f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66) -f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66) -f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66) -f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66) -f8: psubb Pq,Qq | psubb Vdq,Wdq (66) -f9: psubw Pq,Qq | psubw Vdq,Wdq (66) -fa: psubd Pq,Qq | psubd Vdq,Wdq (66) -fb: psubq Pq,Qq | psubq Vdq,Wdq (66) -fc: paddb Pq,Qq | paddb Vdq,Wdq (66) -fd: paddw Pq,Qq | paddw Vdq,Wdq (66) -fe: paddd Pq,Qq | paddd Vdq,Wdq (66) +f0: lddqu Vdq,Mdq (F2),(VEX) +f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128) +f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128) +f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128) +f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128) +f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128) +f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128) +f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128) +f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128) +f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128) +fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128) +fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128) +fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128) +fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128) +fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128) ff: EndTable Table: 3-byte opcode 1 (0x0f 0x38) Referrer: 3-byte escape 1 +AVXcode: 2 # 0x0f 0x38 0x00-0x0f -00: pshufb Pq,Qq | pshufb Vdq,Wdq (66) -01: phaddw Pq,Qq | phaddw Vdq,Wdq (66) -02: phaddd Pq,Qq | phaddd Vdq,Wdq (66) -03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66) -04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66) -05: phsubw Pq,Qq | phsubw Vdq,Wdq (66) -06: phsubd Pq,Qq | phsubd Vdq,Wdq (66) -07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66) -08: psignb Pq,Qq | psignb Vdq,Wdq (66) -09: psignw Pq,Qq | psignw Vdq,Wdq (66) -0a: psignd Pq,Qq | psignd Vdq,Wdq (66) -0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66) -0c: -0d: -0e: -0f: +00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128) +01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128) +02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128) +03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128) +04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128) +05: phsubw 
Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128) +06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128) +07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128) +08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128) +09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128) +0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128) +0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128) +0c: Vpermilps /r (66),(oVEX) +0d: Vpermilpd /r (66),(oVEX) +0e: vtestps /r (66),(oVEX) +0f: vtestpd /r (66),(oVEX) # 0x0f 0x38 0x10-0x1f 10: pblendvb Vdq,Wdq (66) 11: @@ -594,90 +604,99 @@ Referrer: 3-byte escape 1 14: blendvps Vdq,Wdq (66) 15: blendvpd Vdq,Wdq (66) 16: -17: ptest Vdq,Wdq (66) -18: -19: -1a: +17: ptest Vdq,Wdq (66),(VEX) +18: vbroadcastss /r (66),(oVEX) +19: vbroadcastsd /r (66),(oVEX),(o256) +1a: vbroadcastf128 /r (66),(oVEX),(o256) 1b: -1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66) -1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66) -1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66) +1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128) +1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128) +1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128) 1f: # 0x0f 0x38 0x20-0x2f -20: pmovsxbw Vdq,Udq/Mq (66) -21: pmovsxbd Vdq,Udq/Md (66) -22: pmovsxbq Vdq,Udq/Mw (66) -23: pmovsxwd Vdq,Udq/Mq (66) -24: pmovsxwq Vdq,Udq/Md (66) -25: pmovsxdq Vdq,Udq/Mq (66) +20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128) +21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128) +22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128) +23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128) +24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128) +25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128) 26: 27: -28: pmuldq Vdq,Wdq (66) -29: pcmpeqq Vdq,Wdq (66) -2a: movntdqa Vdq,Mdq (66) -2b: packusdw Vdq,Wdq (66) -2c: -2d: -2e: -2f: +28: pmuldq Vdq,Wdq (66),(VEX),(o128) +29: pcmpeqq Vdq,Wdq (66),(VEX),(o128) +2a: movntdqa Vdq,Mdq (66),(VEX),(o128) +2b: packusdw Vdq,Wdq (66),(VEX),(o128) +2c: vmaskmovps(ld) /r (66),(oVEX) +2d: vmaskmovpd(ld) /r (66),(oVEX) +2e: vmaskmovps(st) /r (66),(oVEX) +2f: vmaskmovpd(st) /r (66),(oVEX) # 0x0f 0x38 0x30-0x3f -30: pmovzxbw Vdq,Udq/Mq (66) -31: pmovzxbd Vdq,Udq/Md (66) -32: pmovzxbq Vdq,Udq/Mw (66) -33: pmovzxwd Vdq,Udq/Mq (66) -34: pmovzxwq Vdq,Udq/Md (66) -35: pmovzxdq Vdq,Udq/Mq (66) +30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128) +31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128) +32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128) +33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128) +34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128) +35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128) 36: -37: pcmpgtq Vdq,Wdq (66) -38: pminsb Vdq,Wdq (66) -39: pminsd Vdq,Wdq (66) -3a: pminuw Vdq,Wdq (66) -3b: pminud Vdq,Wdq (66) -3c: pmaxsb Vdq,Wdq (66) -3d: pmaxsd Vdq,Wdq (66) -3e: pmaxuw Vdq,Wdq (66) -3f: pmaxud Vdq,Wdq (66) +37: pcmpgtq Vdq,Wdq (66),(VEX),(o128) +38: pminsb Vdq,Wdq (66),(VEX),(o128) +39: pminsd Vdq,Wdq (66),(VEX),(o128) +3a: pminuw Vdq,Wdq (66),(VEX),(o128) +3b: pminud Vdq,Wdq (66),(VEX),(o128) +3c: pmaxsb Vdq,Wdq (66),(VEX),(o128) +3d: pmaxsd Vdq,Wdq (66),(VEX),(o128) +3e: pmaxuw Vdq,Wdq (66),(VEX),(o128) +3f: pmaxud Vdq,Wdq (66),(VEX),(o128) # 0x0f 0x38 0x4f-0xff -40: pmulld Vdq,Wdq (66) -41: phminposuw Vdq,Wdq (66) +40: pmulld Vdq,Wdq (66),(VEX),(o128) +41: phminposuw Vdq,Wdq (66),(VEX),(o128) 80: INVEPT Gd/q,Mdq (66) 81: INVPID Gd/q,Mdq (66) -db: aesimc Vdq,Wdq (66) -dc: aesenc Vdq,Wdq (66) -dd: aesenclast Vdq,Wdq (66) -de: aesdec Vdq,Wdq (66) -df: aesdeclast Vdq,Wdq (66) +db: aesimc Vdq,Wdq (66),(VEX),(o128) +dc: aesenc Vdq,Wdq (66),(VEX),(o128) +dd: aesenclast Vdq,Wdq (66),(VEX),(o128) +de: aesdec Vdq,Wdq (66),(VEX),(o128) +df: aesdeclast Vdq,Wdq 
(66),(VEX),(o128) f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) EndTable Table: 3-byte opcode 2 (0x0f 0x3a) Referrer: 3-byte escape 2 +AVXcode: 3 # 0x0f 0x3a 0x00-0xff -08: roundps Vdq,Wdq,Ib (66) -09: roundpd Vdq,Wdq,Ib (66) -0a: roundss Vss,Wss,Ib (66) -0b: roundsd Vsd,Wsd,Ib (66) -0c: blendps Vdq,Wdq,Ib (66) -0d: blendpd Vdq,Wdq,Ib (66) -0e: pblendw Vdq,Wdq,Ib (66) -0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66) -14: pextrb Rd/Mb,Vdq,Ib (66) -15: pextrw Rd/Mw,Vdq,Ib (66) -16: pextrd/pextrq Ed/q,Vdq,Ib (66) -17: extractps Ed,Vdq,Ib (66) -20: pinsrb Vdq,Rd/q/Mb,Ib (66) -21: insertps Vdq,Udq/Md,Ib (66) -22: pinsrd/pinsrq Vdq,Ed/q,Ib (66) -40: dpps Vdq,Wdq,Ib (66) -41: dppd Vdq,Wdq,Ib (66) -42: mpsadbw Vdq,Wdq,Ib (66) -44: pclmulq Vdq,Wdq,Ib (66) -60: pcmpestrm Vdq,Wdq,Ib (66) -61: pcmpestri Vdq,Wdq,Ib (66) -62: pcmpistrm Vdq,Wdq,Ib (66) -63: pcmpistri Vdq,Wdq,Ib (66) -df: aeskeygenassist Vdq,Wdq,Ib (66) +04: vpermilps /r,Ib (66),(oVEX) +05: vpermilpd /r,Ib (66),(oVEX) +06: vperm2f128 /r,Ib (66),(oVEX),(o256) +08: roundps Vdq,Wdq,Ib (66),(VEX) +09: roundpd Vdq,Wdq,Ib (66),(VEX) +0a: roundss Vss,Wss,Ib (66),(VEX),(o128) +0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128) +0c: blendps Vdq,Wdq,Ib (66),(VEX) +0d: blendpd Vdq,Wdq,Ib (66),(VEX) +0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128) +0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128) +14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128) +15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128) +16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128) +17: extractps Ed,Vdq,Ib (66),(VEX),(o128) +18: vinsertf128 /r,Ib (66),(oVEX),(o256) +19: vextractf128 /r,Ib (66),(oVEX),(o256) +20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128) +21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128) +22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128) +40: dpps Vdq,Wdq,Ib (66),(VEX) +41: dppd Vdq,Wdq,Ib (66),(VEX),(o128) +42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128) +44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128) +4a: vblendvps /r,Ib (66),(oVEX) +4b: vblendvpd /r,Ib (66),(oVEX) +4c: vpblendvb /r,Ib (66),(oVEX),(o128) +60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128) +61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128) +62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128) +63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128) +df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128) EndTable GrpTable: Grp1 @@ -785,29 +804,29 @@ GrpTable: Grp11 EndTable GrpTable: Grp12 -2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B) -4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B) -6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B) +2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128) +4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128) +6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128) EndTable GrpTable: Grp13 -2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B) -4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B) -6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B) +2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128) +4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128) +6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128) EndTable GrpTable: Grp14 -2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B) -3: psrldq Udq,Ib (66),(11B) -6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B) -7: pslldq Udq,Ib (66),(11B) +2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128) +3: psrldq Udq,Ib (66),(11B),(VEX),(o128) +6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128) +7: pslldq Udq,Ib (66),(11B),(VEX),(o128) EndTable GrpTable: Grp15 0: fxsave 1: fxstor -2: ldmxcsr -3: stmxcsr +2: ldmxcsr (VEX) +3: stmxcsr (VEX) 4: XSAVE 5: XRSTOR | lfence (11B) 
6: mfence (11B) diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 7d5492951e22..e34e92a28eb6 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -13,6 +13,18 @@ function check_awk_implement() { return "" } +# Clear working vars +function clear_vars() { + delete table + delete lptable2 + delete lptable1 + delete lptable3 + eid = -1 # escape id + gid = -1 # group id + aid = -1 # AVX id + tname = "" +} + BEGIN { # Implementation error checking awkchecked = check_awk_implement() @@ -24,11 +36,15 @@ BEGIN { # Setup generating tables print "/* x86 opcode map generated from x86-opcode-map.txt */" - print "/* Do not change this code. */" + print "/* Do not change this code. */\n" ggid = 1 geid = 1 + gaid = 0 + delete etable + delete gtable + delete atable - opnd_expr = "^[[:alpha:]]" + opnd_expr = "^[[:alpha:]/]" ext_expr = "^\\(" sep_expr = "^\\|$" group_expr = "^Grp[[:alnum:]]+" @@ -46,19 +62,19 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" - modrm_expr = "^([CDEGMNPQRSUVW][[:lower:]]+|NTA|T[012])" + modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])" force64_expr = "\\([df]64\\)" rex_expr = "^REX(\\.[XRWB]+)*" fpu_expr = "^ESC" # TODO lprefix1_expr = "\\(66\\)" - delete lptable1 - lprefix2_expr = "\\(F2\\)" - delete lptable2 - lprefix3_expr = "\\(F3\\)" - delete lptable3 + lprefix2_expr = "\\(F3\\)" + lprefix3_expr = "\\(F2\\)" max_lprefix = 4 + vexok_expr = "\\(VEX\\)" + vexonly_expr = "\\(oVEX\\)" + prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" prefix_num["REPNE"] = "INAT_PFX_REPNE" @@ -71,12 +87,10 @@ BEGIN { prefix_num["SEG=GS"] = "INAT_PFX_GS" prefix_num["SEG=SS"] = "INAT_PFX_SS" prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" + prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2" + prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3" - delete table - delete etable - delete gtable - eid = -1 - gid = -1 + clear_vars() } function semantic_error(msg) { @@ -97,14 +111,12 @@ function array_size(arr, i,c) { /^Table:/ { print "/* " $0 " */" + if (tname != "") + semantic_error("Hit Table: before EndTable:."); } /^Referrer:/ { - if (NF == 1) { - # primary opcode table - tname = "inat_primary_table" - eid = -1 - } else { + if (NF != 1) { # escape opcode table ref = "" for (i = 2; i <= NF; i++) @@ -114,6 +126,19 @@ function array_size(arr, i,c) { } } +/^AVXcode:/ { + if (NF != 1) { + # AVX/escape opcode table + aid = $2 + if (gaid <= aid) + gaid = aid + 1 + if (tname == "") # AVX only opcode table + tname = sprintf("inat_avx_table_%d", $2) + } + if (aid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -162,30 +187,33 @@ function print_table(tbl,name,fmt,n) print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]", "0x%02x", 256) etable[eid,0] = tname + if (aid >= 0) + atable[aid,0] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", "0x%02x", 256) etable[eid,1] = tname "_1" + if (aid >= 0) + atable[aid,1] = tname "_1" } if (array_size(lptable2) != 0) { print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]", "0x%02x", 256) etable[eid,2] = tname "_2" + if (aid >= 0) + atable[aid,2] = tname "_2" } if (array_size(lptable3) != 0) { print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]", "0x%02x", 256) etable[eid,3] = tname "_3" + if (aid >= 0) + atable[aid,3] = tname "_3" } } print "" - delete table - delete lptable1 - delete lptable2 - delete 
lptable3 - gid = -1 - eid = -1 + clear_vars() } function add_flags(old,new) { @@ -284,6 +312,14 @@ function convert_operands(opnd, i,imm,mod) if (match(opcode, fpu_expr)) flags = add_flags(flags, "INAT_MODRM") + # check VEX only code + if (match(ext, vexonly_expr)) + flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") + + # check VEX only code + if (match(ext, vexok_expr)) + flags = add_flags(flags, "INAT_VEXOK") + # check prefixes if (match(ext, prefix_expr)) { if (!prefix_num[opcode]) @@ -330,5 +366,15 @@ END { for (j = 0; j < max_lprefix; j++) if (gtable[i,j]) print " ["i"]["j"] = "gtable[i,j]"," + print "};\n" + # print AVX opcode map's array + print "/* AVX opcode map array */" + print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < gaid; i++) + for (j = 0; j < max_lprefix; j++) + if (atable[i,j]) + print " ["i"]["j"] = "atable[i,j]"," print "};" } + -- cgit v1.2.2 From 3f7e454af1dd8b9cea410d9380d3f71477e94f2b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Oct 2009 16:42:35 -0400 Subject: x86: Add Intel FMA instructions to x86 opcode map Add Intel FMA(FUSED-MULTIPLY-ADD) instructions to x86 opcode map for x86 instruction decoder. Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju LKML-Reference: <20091027204235.30545.33997.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 9887bfeeb2db..a793da5e560e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -647,11 +647,43 @@ AVXcode: 2 3d: pmaxsd Vdq,Wdq (66),(VEX),(o128) 3e: pmaxuw Vdq,Wdq (66),(VEX),(o128) 3f: pmaxud Vdq,Wdq (66),(VEX),(o128) -# 0x0f 0x38 0x4f-0xff +# 0x0f 0x38 0x40-0x8f 40: pmulld Vdq,Wdq (66),(VEX),(o128) 41: phminposuw Vdq,Wdq (66),(VEX),(o128) 80: INVEPT Gd/q,Mdq (66) 81: INVPID Gd/q,Mdq (66) +# 0x0f 0x38 0x90-0xbf (FMA) +96: vfmaddsub132pd/ps /r (66),(VEX) +97: vfmsubadd132pd/ps /r (66),(VEX) +98: vfmadd132pd/ps /r (66),(VEX) +99: vfmadd132sd/ss /r (66),(VEX),(o128) +9a: vfmsub132pd/ps /r (66),(VEX) +9b: vfmsub132sd/ss /r (66),(VEX),(o128) +9c: vfnmadd132pd/ps /r (66),(VEX) +9d: vfnmadd132sd/ss /r (66),(VEX),(o128) +9e: vfnmsub132pd/ps /r (66),(VEX) +9f: vfnmsub132sd/ss /r (66),(VEX),(o128) +a6: vfmaddsub213pd/ps /r (66),(VEX) +a7: vfmsubadd213pd/ps /r (66),(VEX) +a8: vfmadd213pd/ps /r (66),(VEX) +a9: vfmadd213sd/ss /r (66),(VEX),(o128) +aa: vfmsub213pd/ps /r (66),(VEX) +ab: vfmsub213sd/ss /r (66),(VEX),(o128) +ac: vfnmadd213pd/ps /r (66),(VEX) +ad: vfnmadd213sd/ss /r (66),(VEX),(o128) +ae: vfnmsub213pd/ps /r (66),(VEX) +af: vfnmsub213sd/ss /r (66),(VEX),(o128) +b6: vfmaddsub231pd/ps /r (66),(VEX) +b7: vfmsubadd231pd/ps /r (66),(VEX) +b8: vfmadd231pd/ps /r (66),(VEX) +b9: vfmadd231sd/ss /r (66),(VEX),(o128) +ba: vfmsub231pd/ps /r (66),(VEX) +bb: vfmsub231sd/ss /r (66),(VEX),(o128) +bc: vfnmadd231pd/ps /r (66),(VEX) +bd: vfnmadd231sd/ss /r (66),(VEX),(o128) +be: vfnmsub231pd/ps /r (66),(VEX) +bf: vfnmsub231sd/ss /r (66),(VEX),(o128) +# 0x0f 0x38 0xc0-0xff db: aesimc Vdq,Wdq (66),(VEX),(o128) dc: aesenc Vdq,Wdq (66),(VEX),(o128) dd: aesenclast Vdq,Wdq (66),(VEX),(o128) -- cgit v1.2.2 From 
0f5e4816dbf38ce9488e611ca2296925c1e90d5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:12 +0900 Subject: percpu: remove some sparse warnings Make the following changes to remove some sparse warnings. * Make DEFINE_PER_CPU_SECTION() declare __pcpu_unique_* before defining it. * Annotate pcpu_extend_area_map() that it is entered with pcpu_lock held, releases it and then reacquires it. * Make percpu related macros use unique nested variable names. * While at it, add pcpu prefix to __size_call[_return]() macros as to-be-implemented sparse annotations will add percpu specific stuff to these macros. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Rusty Russell --- arch/x86/include/asm/percpu.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 8b5ec19bdef4..0c44196b78ac 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -74,31 +74,31 @@ extern void __bad_percpu_size(void); #define percpu_to_op(op, var, val) \ do { \ - typedef typeof(var) T__; \ + typedef typeof(var) pto_T__; \ if (0) { \ - T__ tmp__; \ - tmp__ = (val); \ + pto_T__ pto_tmp__; \ + pto_tmp__ = (val); \ } \ switch (sizeof(var)) { \ case 1: \ asm(op "b %1,"__percpu_arg(0) \ : "+m" (var) \ - : "qi" ((T__)(val))); \ + : "qi" ((pto_T__)(val))); \ break; \ case 2: \ asm(op "w %1,"__percpu_arg(0) \ : "+m" (var) \ - : "ri" ((T__)(val))); \ + : "ri" ((pto_T__)(val))); \ break; \ case 4: \ asm(op "l %1,"__percpu_arg(0) \ : "+m" (var) \ - : "ri" ((T__)(val))); \ + : "ri" ((pto_T__)(val))); \ break; \ case 8: \ asm(op "q %1,"__percpu_arg(0) \ : "+m" (var) \ - : "re" ((T__)(val))); \ + : "re" ((pto_T__)(val))); \ break; \ default: __bad_percpu_size(); \ } \ @@ -106,31 +106,31 @@ do { \ #define percpu_from_op(op, var, constraint) \ ({ \ - typeof(var) ret__; \ + typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ - : "=q" (ret__) \ + : "=q" (pfo_ret__) \ : constraint); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ - : "=r" (ret__) \ + : "=r" (pfo_ret__) \ : constraint); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ - : "=r" (ret__) \ + : "=r" (pfo_ret__) \ : constraint); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ - : "=r" (ret__) \ + : "=r" (pfo_ret__) \ : constraint); \ break; \ default: __bad_percpu_size(); \ } \ - ret__; \ + pfo_ret__; \ }) /* -- cgit v1.2.2 From f16250669d78a32bdfb27cec4d791e85141e11e2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:13 +0900 Subject: percpu: make percpu symbols in cpufreq unique This patch updates percpu related symbols in cpufreq such that percpu symbols are unique and don't clash with local symbols. This serves two purposes of decreasing the possibility of global percpu symbol collision and allowing dropping per_cpu__ prefix from percpu symbols. * drivers/cpufreq/cpufreq.c: s/policy_cpu/cpufreq_policy_cpu/ * drivers/cpufreq/freq_table.c: s/show_table/cpufreq_show_table/ * arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c: s/drv_data/acfreq_data/ s/old_perf/acfreq_old_perf/ Partly based on Rusty Russell's "alloc_percpu: rename percpu vars which cause name clashes" patch. 
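Both of these renames guard against the same class of name-collision bug; a contrived sketch (none of these names are in the tree) of what generic temporaries like ret__ can run into once a caller happens to pick the same identifier:

	#define BAD_FROM_OP(var)		\
	({					\
		typeof(var) ret__;		\
		ret__ = (var);			\
		ret__;				\
	})

	int read_it(void)
	{
		int ret__ = 42;
		/* Expands to "typeof(ret__) ret__; ret__ = (ret__);",
		 * so the macro's ret__ shadows the caller's variable
		 * and is read uninitialized. Prefixed names such as
		 * pfo_ret__ or acfreq_data make such clashes far less
		 * likely. */
		return BAD_FROM_OP(ret__);
	}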
Signed-off-by: Tejun Heo Cc: Rusty Russell --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7d5c3b0ea8da..43eb3465dda7 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -68,9 +68,9 @@ struct acpi_cpufreq_data { unsigned int cpu_feature; }; -static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); +static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); -static DEFINE_PER_CPU(struct aperfmperf, old_perf); +static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -214,14 +214,14 @@ static u32 get_cur_val(const struct cpumask *mask) if (unlikely(cpumask_empty(mask))) return 0; - switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) { + switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; - perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data; + perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; cmd.addr.io.port = perf->control_register.address; cmd.addr.io.bit_width = perf->control_register.bit_width; break; @@ -268,8 +268,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy, if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) return 0; - ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); - per_cpu(old_perf, cpu) = perf; + ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); + per_cpu(acfreq_old_perf, cpu) = perf; retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; @@ -278,7 +278,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy, static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); unsigned int freq; unsigned int cached_freq; @@ -322,7 +322,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); struct acpi_processor_performance *perf; struct cpufreq_freqs freqs; struct drv_cmd cmd; @@ -416,7 +416,7 @@ out: static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_verify\n"); @@ -563,7 +563,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) return -ENOMEM; data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); - per_cpu(drv_data, cpu) = data; + per_cpu(acfreq_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; @@ -714,20 +714,20 @@ err_unreg: acpi_processor_unregister_performance(perf, cpu); err_free: kfree(data); - per_cpu(drv_data, cpu) = NULL; + per_cpu(acfreq_data, cpu) = NULL; return result; } static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - 
struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); - per_cpu(drv_data, policy->cpu) = NULL; + per_cpu(acfreq_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); kfree(data); @@ -738,7 +738,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_resume\n"); -- cgit v1.2.2 From c6e22f9e3e99cc221fe01a0cacf94a9da8a59c31 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:13 +0900 Subject: percpu: make percpu symbols in xen unique This patch updates percpu related symbols in xen such that percpu symbols are unique and don't clash with local symbols. This serves two purposes of decreasing the possibility of global percpu symbol collision and allowing dropping per_cpu__ prefix from percpu symbols. * arch/x86/xen/smp.c, arch/x86/xen/time.c, arch/ia64/xen/irq_xen.c: add xen_ prefix to percpu variables * arch/ia64/xen/time.c: add xen_ prefix to percpu variables, drop processed_ prefix and make them static Partly based on Rusty Russell's "alloc_percpu: rename percpu vars which cause name clashes" patch. Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: Jeremy Fitzhardinge Cc: Chris Wright --- arch/x86/xen/smp.c | 41 +++++++++++++++++++++-------------------- arch/x86/xen/time.c | 24 ++++++++++++------------ 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index fe03eeed7b48..1167d9830f5f 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -35,10 +35,10 @@ cpumask_var_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); -static DEFINE_PER_CPU(int, callfuncsingle_irq); -static DEFINE_PER_CPU(int, debug_irq) = -1; +static DEFINE_PER_CPU(int, xen_resched_irq); +static DEFINE_PER_CPU(int, xen_callfunc_irq); +static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); +static DEFINE_PER_CPU(int, xen_debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -103,7 +103,7 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(resched_irq, cpu) = rc; + per_cpu(xen_resched_irq, cpu) = rc; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, @@ -114,7 +114,7 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(callfunc_irq, cpu) = rc; + per_cpu(xen_callfunc_irq, cpu) = rc; debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, @@ -122,7 +122,7 @@ static int xen_smp_intr_init(unsigned int cpu) debug_name, NULL); if (rc < 0) goto fail; - per_cpu(debug_irq, cpu) = rc; + per_cpu(xen_debug_irq, cpu) = rc; callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, @@ -133,19 +133,20 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(callfuncsingle_irq, cpu) = rc; + per_cpu(xen_callfuncsingle_irq, cpu) = rc; return 
0; fail: - if (per_cpu(resched_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); - if (per_cpu(callfunc_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); - if (per_cpu(debug_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); - if (per_cpu(callfuncsingle_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + if (per_cpu(xen_resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + if (per_cpu(xen_callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + if (per_cpu(xen_debug_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), + NULL); return rc; } @@ -348,10 +349,10 @@ static void xen_cpu_die(unsigned int cpu) current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } - unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0a5aa44299a5..26e37b787ad3 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -31,14 +31,14 @@ #define NS_PER_TICK (1000000000LL / HZ) /* runstate info updated by Xen */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); /* snapshots of runstate info */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); /* unused ns of stolen and blocked time */ -static DEFINE_PER_CPU(u64, residual_stolen); -static DEFINE_PER_CPU(u64, residual_blocked); +static DEFINE_PER_CPU(u64, xen_residual_stolen); +static DEFINE_PER_CPU(u64, xen_residual_blocked); /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) @@ -79,7 +79,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) BUG_ON(preemptible()); - state = &__get_cpu_var(runstate); + state = &__get_cpu_var(xen_runstate); /* * The runstate info is always updated by the hypervisor on @@ -97,14 +97,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) /* return true when a vcpu could run but has no real cpu to run on */ bool xen_vcpu_stolen(int vcpu) { - return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; + return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } static void setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; - area.addr.v = &per_cpu(runstate, cpu); + area.addr.v = &per_cpu(xen_runstate, cpu); if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area)) @@ -122,7 +122,7 @@ static void do_stolen_accounting(void) WARN_ON(state.state != RUNSTATE_running); - snap = &__get_cpu_var(runstate_snapshot); + snap = &__get_cpu_var(xen_runstate_snapshot); /* work out how much time the VCPU has not been runn*ing* */ blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; 
@@ -133,24 +133,24 @@ static void do_stolen_accounting(void) /* Add the appropriate number of ticks of stolen time, including any left-overs from last time. */ - stolen = runnable + offline + __get_cpu_var(residual_stolen); + stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); if (stolen < 0) stolen = 0; ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); - __get_cpu_var(residual_stolen) = stolen; + __get_cpu_var(xen_residual_stolen) = stolen; account_steal_ticks(ticks); /* Add the appropriate number of ticks of blocked time, including any left-overs from last time. */ - blocked += __get_cpu_var(residual_blocked); + blocked += __get_cpu_var(xen_residual_blocked); if (blocked < 0) blocked = 0; ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); - __get_cpu_var(residual_blocked) = blocked; + __get_cpu_var(xen_residual_blocked) = blocked; account_idle_ticks(ticks); } -- cgit v1.2.2 From 0fe1e009541e925adc1748a605d8b66188e4b2ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:14 +0900 Subject: percpu: make percpu symbols in x86 unique This patch updates percpu related symbols in x86 such that percpu symbols are unique and don't clash with local symbols. This serves two purposes of decreasing the possibility of global percpu symbol collision and allowing dropping per_cpu__ prefix from percpu symbols. * arch/x86/kernel/cpu/common.c: rename local variable to avoid collision * arch/x86/kvm/svm.c: s/svm_data/sd/ for local variables to avoid collision * arch/x86/kernel/cpu/cpu_debug.c: s/cpu_arr/cpud_arr/ s/priv_arr/cpud_priv_arr/ s/cpu_priv_count/cpud_priv_count/ * arch/x86/kernel/cpu/intel_cacheinfo.c: s/cpuid4_info/ici_cpuid4_info/ s/cache_kobject/ici_cache_kobject/ s/index_kobject/ici_index_kobject/ * arch/x86/kernel/ds.c: s/cpu_context/cpu_ds_context/ Partly based on Rusty Russell's "alloc_percpu: rename percpu vars which cause name clashes" patch. Signed-off-by: Tejun Heo Acked-by: (kvm) Avi Kivity Cc: Rusty Russell Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. 
Peter Anvin Cc: Marcelo Tosatti Cc: x86@kernel.org --- arch/x86/kernel/cpu/common.c | 8 ++--- arch/x86/kernel/cpu/cpu_debug.c | 30 ++++++++--------- arch/x86/kernel/cpu/intel_cacheinfo.c | 54 +++++++++++++++--------------- arch/x86/kernel/ds.c | 4 +-- arch/x86/kvm/svm.c | 63 +++++++++++++++++------------------ 5 files changed, 79 insertions(+), 80 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..3192f22f2fdd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1093,7 +1093,7 @@ static void clear_all_debug_regs(void) void __cpuinit cpu_init(void) { - struct orig_ist *orig_ist; + struct orig_ist *oist; struct task_struct *me; struct tss_struct *t; unsigned long v; @@ -1102,7 +1102,7 @@ void __cpuinit cpu_init(void) cpu = stack_smp_processor_id(); t = &per_cpu(init_tss, cpu); - orig_ist = &per_cpu(orig_ist, cpu); + oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (cpu != 0 && percpu_read(node_number) == 0 && @@ -1143,12 +1143,12 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!orig_ist->ist[0]) { + if (!oist->ist[0]) { char *estacks = per_cpu(exception_stacks, cpu); for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = + oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index dca325c03999..b368cd862997 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,9 +30,9 @@ #include #include -static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); -static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); -static DEFINE_PER_CPU(int, cpu_priv_count); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr); +static DEFINE_PER_CPU(int, cpud_priv_count); static DEFINE_MUTEX(cpu_debug_lock); @@ -531,7 +531,7 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, /* Already intialized */ if (file == CPU_INDEX_BIT) - if (per_cpu(cpu_arr[type].init, cpu)) + if (per_cpu(cpud_arr[type].init, cpu)) return 0; priv = kzalloc(sizeof(*priv), GFP_KERNEL); @@ -543,8 +543,8 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, priv->reg = reg; priv->file = file; mutex_lock(&cpu_debug_lock); - per_cpu(priv_arr[type], cpu) = priv; - per_cpu(cpu_priv_count, cpu)++; + per_cpu(cpud_priv_arr[type], cpu) = priv; + per_cpu(cpud_priv_count, cpu)++; mutex_unlock(&cpu_debug_lock); if (file) @@ -552,10 +552,10 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, dentry, (void *)priv, &cpu_fops); else { debugfs_create_file(cpu_base[type].name, S_IRUGO, - per_cpu(cpu_arr[type].dentry, cpu), + per_cpu(cpud_arr[type].dentry, cpu), (void *)priv, &cpu_fops); mutex_lock(&cpu_debug_lock); - per_cpu(cpu_arr[type].init, cpu) = 1; + per_cpu(cpud_arr[type].init, cpu) = 1; mutex_unlock(&cpu_debug_lock); } @@ -615,7 +615,7 @@ static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) if (!is_typeflag_valid(cpu, cpu_base[type].flag)) continue; cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); - per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry; + per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry; if (type < CPU_TSS_BIT) err = cpu_init_msr(cpu, type, cpu_dentry); @@ -647,11 +647,11 @@ static int cpu_init_cpu(void) err = 
cpu_init_allreg(cpu, cpu_dentry); pr_info("cpu%d(%d) debug files %d\n", - cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu)); - if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) { + cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu)); + if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) { pr_err("Register files count %d exceeds limit %d\n", - per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES); - per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES; + per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES); + per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES; err = -ENFILE; } if (err) @@ -676,8 +676,8 @@ static void __exit cpu_debug_exit(void) debugfs_remove_recursive(cpu_debugfs_dir); for (cpu = 0; cpu < nr_cpu_ids; cpu++) - for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++) - kfree(per_cpu(priv_arr[i], cpu)); + for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++) + kfree(per_cpu(cpud_priv_arr[i], cpu)); } module_init(cpu_debug_init); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 804c40e2bc3e..f5ccb4fa5a5d 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -512,8 +512,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) #ifdef CONFIG_SYSFS /* pointer to _cpuid4_info array (for each cache leaf) */ -static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); -#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) +static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); +#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) #ifdef CONFIG_SMP static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) @@ -526,7 +526,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { struct cpuinfo_x86 *d; for_each_online_cpu(i) { - if (!per_cpu(cpuid4_info, i)) + if (!per_cpu(ici_cpuid4_info, i)) continue; d = &cpu_data(i); this_leaf = CPUID4_INFO_IDX(i, index); @@ -548,7 +548,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) c->apicid >> index_msb) { cpumask_set_cpu(i, to_cpumask(this_leaf->shared_cpu_map)); - if (i != cpu && per_cpu(cpuid4_info, i)) { + if (i != cpu && per_cpu(ici_cpuid4_info, i)) { sibling_leaf = CPUID4_INFO_IDX(i, index); cpumask_set_cpu(cpu, to_cpumask( @@ -587,8 +587,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); - kfree(per_cpu(cpuid4_info, cpu)); - per_cpu(cpuid4_info, cpu) = NULL; + kfree(per_cpu(ici_cpuid4_info, cpu)); + per_cpu(ici_cpuid4_info, cpu) = NULL; } static int @@ -627,15 +627,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) if (num_cache_leaves == 0) return -ENOENT; - per_cpu(cpuid4_info, cpu) = kzalloc( + per_cpu(ici_cpuid4_info, cpu) = kzalloc( sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) + if (per_cpu(ici_cpuid4_info, cpu) == NULL) return -ENOMEM; smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { - kfree(per_cpu(cpuid4_info, cpu)); - per_cpu(cpuid4_info, cpu) = NULL; + kfree(per_cpu(ici_cpuid4_info, cpu)); + per_cpu(ici_cpuid4_info, cpu) = NULL; } return retval; @@ -647,7 +647,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ /* pointer to kobject for cpuX/cache */ -static DEFINE_PER_CPU(struct kobject *, cache_kobject); +static 
DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); struct _index_kobject { struct kobject kobj; @@ -656,8 +656,8 @@ struct _index_kobject { }; /* pointer to array of kobjects for cpuX/cache/indexY */ -static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); -#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) +static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); +#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ static ssize_t show_##file_name \ @@ -876,10 +876,10 @@ static struct kobj_type ktype_percpu_entry = { static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) { - kfree(per_cpu(cache_kobject, cpu)); - kfree(per_cpu(index_kobject, cpu)); - per_cpu(cache_kobject, cpu) = NULL; - per_cpu(index_kobject, cpu) = NULL; + kfree(per_cpu(ici_cache_kobject, cpu)); + kfree(per_cpu(ici_index_kobject, cpu)); + per_cpu(ici_cache_kobject, cpu) = NULL; + per_cpu(ici_index_kobject, cpu) = NULL; free_cache_attributes(cpu); } @@ -895,14 +895,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) return err; /* Allocate all required memory */ - per_cpu(cache_kobject, cpu) = + per_cpu(ici_cache_kobject, cpu) = kzalloc(sizeof(struct kobject), GFP_KERNEL); - if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) + if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL)) goto err_out; - per_cpu(index_kobject, cpu) = kzalloc( + per_cpu(ici_index_kobject, cpu) = kzalloc( sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); - if (unlikely(per_cpu(index_kobject, cpu) == NULL)) + if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL)) goto err_out; return 0; @@ -926,7 +926,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) if (unlikely(retval < 0)) return retval; - retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), + retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, &sys_dev->kobj, "%s", "cache"); if (retval < 0) { @@ -940,12 +940,12 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object->index = i; retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, - per_cpu(cache_kobject, cpu), + per_cpu(ici_cache_kobject, cpu), "index%1lu", i); if (unlikely(retval)) { for (j = 0; j < i; j++) kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); - kobject_put(per_cpu(cache_kobject, cpu)); + kobject_put(per_cpu(ici_cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); return retval; } @@ -953,7 +953,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); - kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); + kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD); return 0; } @@ -962,7 +962,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i; - if (per_cpu(cpuid4_info, cpu) == NULL) + if (per_cpu(ici_cpuid4_info, cpu) == NULL) return; if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) return; @@ -970,7 +970,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) for (i = 0; i < num_cache_leaves; i++) kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); - kobject_put(per_cpu(cache_kobject, cpu)); + kobject_put(per_cpu(ici_cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); } diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index ef42a038f1a6..1c47390dd0e5 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -265,13 +265,13 @@ struct 
ds_context { int cpu; }; -static DEFINE_PER_CPU(struct ds_context *, cpu_context); +static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context); static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 944cc9c04b3c..6c79a14a3b6f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -319,7 +319,7 @@ static void svm_hardware_disable(void *garbage) static void svm_hardware_enable(void *garbage) { - struct svm_cpu_data *svm_data; + struct svm_cpu_data *sd; uint64_t efer; struct descriptor_table gdt_descr; struct desc_struct *gdt; @@ -329,62 +329,61 @@ static void svm_hardware_enable(void *garbage) printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); return; } - svm_data = per_cpu(svm_data, me); + sd = per_cpu(svm_data, me); - if (!svm_data) { + if (!sd) { printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", me); return; } - svm_data->asid_generation = 1; - svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; - svm_data->next_asid = svm_data->max_asid + 1; + sd->asid_generation = 1; + sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; + sd->next_asid = sd->max_asid + 1; kvm_get_gdt(&gdt_descr); gdt = (struct desc_struct *)gdt_descr.base; - svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); + sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); rdmsrl(MSR_EFER, efer); wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, - page_to_pfn(svm_data->save_area) << PAGE_SHIFT); + page_to_pfn(sd->save_area) << PAGE_SHIFT); } static void svm_cpu_uninit(int cpu) { - struct svm_cpu_data *svm_data - = per_cpu(svm_data, raw_smp_processor_id()); + struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id()); - if (!svm_data) + if (!sd) return; per_cpu(svm_data, raw_smp_processor_id()) = NULL; - __free_page(svm_data->save_area); - kfree(svm_data); + __free_page(sd->save_area); + kfree(sd); } static int svm_cpu_init(int cpu) { - struct svm_cpu_data *svm_data; + struct svm_cpu_data *sd; int r; - svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); - if (!svm_data) + sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); + if (!sd) return -ENOMEM; - svm_data->cpu = cpu; - svm_data->save_area = alloc_page(GFP_KERNEL); + sd->cpu = cpu; + sd->save_area = alloc_page(GFP_KERNEL); r = -ENOMEM; - if (!svm_data->save_area) + if (!sd->save_area) goto err_1; - per_cpu(svm_data, cpu) = svm_data; + per_cpu(svm_data, cpu) = sd; return 0; err_1: - kfree(svm_data); + kfree(sd); return r; } @@ -1094,16 +1093,16 @@ static void save_host_msrs(struct kvm_vcpu *vcpu) #endif } -static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) +static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { - if (svm_data->next_asid > svm_data->max_asid) { - ++svm_data->asid_generation; - svm_data->next_asid = 1; + if (sd->next_asid > sd->max_asid) { + ++sd->asid_generation; + sd->next_asid = 1; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } - svm->asid_generation = svm_data->asid_generation; - svm->vmcb->control.asid = svm_data->next_asid++; + svm->asid_generation = sd->asid_generation; + svm->vmcb->control.asid = sd->next_asid++; } static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) @@ -2377,8 +2376,8 @@ static void 
reload_tss(struct kvm_vcpu *vcpu) { int cpu = raw_smp_processor_id(); - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + sd->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); } @@ -2386,12 +2385,12 @@ static void pre_svm_run(struct vcpu_svm *svm) { int cpu = raw_smp_processor_id(); - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* FIXME: handle wraparound of asid_generation */ - if (svm->asid_generation != svm_data->asid_generation) - new_asid(svm, svm_data); + if (svm->asid_generation != sd->asid_generation) + new_asid(svm, sd); } static void svm_inject_nmi(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From dd17c8f72993f9461e9c19250e3f155d6d99df22 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: remove per_cpu__ prefix. Now that the return from alloc_percpu is compatible with the address of per-cpu vars, it makes sense to hand around the address of per-cpu variables. To make this sane, we remove the per_cpu__ prefix we used created to stop people accidentally using these vars directly. Now we have sparse, we can use that (next patch). tj: * Updated to convert stuff which were missed by or added after the original patch. * Kill per_cpu_var() macro. Signed-off-by: Rusty Russell Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter --- arch/x86/include/asm/percpu.h | 37 +++++++++++++++++-------------------- arch/x86/include/asm/system.h | 8 ++++---- arch/x86/kernel/apic/nmi.c | 6 +++--- arch/x86/kernel/head_32.S | 6 +++--- arch/x86/kernel/vmlinux.lds.S | 4 ++-- arch/x86/xen/xen-asm_32.S | 4 ++-- 6 files changed, 31 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 0c44196b78ac..4c170ccc72ed 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -25,19 +25,18 @@ */ #ifdef CONFIG_SMP #define PER_CPU(var, reg) \ - __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \ - lea per_cpu__##var(reg), reg -#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var + __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \ + lea var(reg), reg +#define PER_CPU_VAR(var) %__percpu_seg:var #else /* ! SMP */ -#define PER_CPU(var, reg) \ - __percpu_mov_op $per_cpu__##var, reg -#define PER_CPU_VAR(var) per_cpu__##var +#define PER_CPU(var, reg) __percpu_mov_op $var, reg +#define PER_CPU_VAR(var) var #endif /* SMP */ #ifdef CONFIG_X86_64_SMP #define INIT_PER_CPU_VAR(var) init_per_cpu__##var #else -#define INIT_PER_CPU_VAR(var) per_cpu__##var +#define INIT_PER_CPU_VAR(var) var #endif #else /* ...!ASSEMBLY */ @@ -60,12 +59,12 @@ * There also must be an entry in vmlinux_64.lds.S */ #define DECLARE_INIT_PER_CPU(var) \ - extern typeof(per_cpu_var(var)) init_per_cpu_var(var) + extern typeof(var) init_per_cpu_var(var) #ifdef CONFIG_X86_64_SMP #define init_per_cpu_var(var) init_per_cpu__##var #else -#define init_per_cpu_var(var) per_cpu_var(var) +#define init_per_cpu_var(var) var #endif /* For arch-specific code, we can use direct single-insn ops (they @@ -142,16 +141,14 @@ do { \ * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. 
*/ -#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ - "m" (per_cpu__##var)) -#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ - "p" (&per_cpu__##var)) -#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) -#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) -#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) -#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) -#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) -#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) +#define percpu_read(var) percpu_from_op("mov", var, "m" (var)) +#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) +#define percpu_write(var, val) percpu_to_op("mov", var, val) +#define percpu_add(var, val) percpu_to_op("add", var, val) +#define percpu_sub(var, val) percpu_to_op("sub", var, val) +#define percpu_and(var, val) percpu_to_op("and", var, val) +#define percpu_or(var, val) percpu_to_op("or", var, val) +#define percpu_xor(var, val) percpu_to_op("xor", var, val) #define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -236,7 +233,7 @@ do { \ ({ \ int old__; \ asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ - : "=r" (old__), "+m" (per_cpu__##var) \ + : "=r" (old__), "+m" (var) \ : "dIr" (bit)); \ old__; \ }) diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index f08f97374892..de10c19d9558 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -31,7 +31,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, "movl %P[task_canary](%[next]), %%ebx\n\t" \ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" #define __switch_canary_oparam \ - , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) + , [stack_canary] "=m" (stack_canary.canary) #define __switch_canary_iparam \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ @@ -113,7 +113,7 @@ do { \ "movq %P[task_canary](%%rsi),%%r8\n\t" \ "movq %%r8,"__percpu_arg([gs_canary])"\n\t" #define __switch_canary_oparam \ - , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) + , [gs_canary] "=m" (irq_stack_union.stack_canary) #define __switch_canary_iparam \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ @@ -134,7 +134,7 @@ do { \ __switch_canary \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ "jnz ret_from_fork\n\t" \ RESTORE_CONTEXT \ : "=a" (last) \ @@ -144,7 +144,7 @@ do { \ [ti_flags] "i" (offsetof(struct thread_info, flags)), \ [_tif_fork] "i" (_TIF_FORK), \ [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (per_cpu_var(current_task)) \ + [current_task] "m" (current_task) \ __switch_canary_iparam \ : "memory", "cc" __EXTRA_CLOBBER) #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index e631cc4416f7..45404379d173 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -437,8 +437,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... 
*/ - __this_cpu_inc(per_cpu_var(alert_counter)); - if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) + __this_cpu_inc(alert_counter); + if (__this_cpu_read(alert_counter) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ @@ -446,7 +446,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(per_cpu_var(alert_counter), 0); + __this_cpu_write(alert_counter, 0); } /* see if the nmi watchdog went off */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 050c278481b1..fd39eaf83b84 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -438,8 +438,8 @@ is386: movl $2,%ecx # set MP */ cmpb $0,ready jne 1f - movl $per_cpu__gdt_page,%eax - movl $per_cpu__stack_canary,%ecx + movl $gdt_page,%eax + movl $stack_canary,%ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -702,7 +702,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ + .long gdt_page /* Overwritten for secondary CPUs */ /* * The boot_gdt must mirror the equivalent in setup.S and is diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9fa..ecb92717c412 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -312,7 +312,7 @@ SECTIONS * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); @@ -323,7 +323,7 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((per_cpu__irq_stack_union == 0), +. = ASSERT((irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 88e15deb8b82..22a2093b5862 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -90,9 +90,9 @@ ENTRY(xen_iret) GET_THREAD_INFO(%eax) movl TI_cpu(%eax), %eax movl __per_cpu_offset(,%eax,4), %eax - mov per_cpu__xen_vcpu(%eax), %eax + mov xen_vcpu(%eax), %eax #else - movl per_cpu__xen_vcpu, %eax + movl xen_vcpu, %eax #endif /* check IF state we're restoring */ -- cgit v1.2.2 From 2d06ef7f42ed8c9969c9aa84e95df5d5c6378327 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 1 Nov 2009 12:49:44 -0500 Subject: crypto: ghash-intel - Hard-code pshufb Old gases don't have a clue what pshufb stands for so we have to hard-code it for now. 
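
For readers decoding the .byte sequences below, an annotated sketch of the encoding (the register aliases DATA=%xmm0, BSWAP=%xmm5 and IN1=%xmm6 are assumptions based on the file's defines, which are not shown in this hunk):

        /*
         * SSSE3 PSHUFB is encoded as 66 0F 38 00 /r.  ModRM 0xc5
         * (mod=11, reg=000, rm=101) selects dest %xmm0 and src %xmm5,
         * i.e. "pshufb %xmm5, %xmm0" = pshufb BSWAP, DATA; ModRM 0xf5
         * (reg=110) targets %xmm6 instead, i.e. pshufb BSWAP, IN1.
         */
        static const unsigned char pshufb_bswap_data[] = {
                0x66, 0x0f, 0x38, 0x00, 0xc5    /* pshufb %xmm5, %xmm0 */
        };
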
Reported-by: Andrew Morton Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_asm.S | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index b9e787a511da..71768d543dbb 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -100,9 +100,11 @@ ENTRY(clmul_ghash_mul) movups (%rdi), DATA movups (%rsi), SHASH movaps .Lbswap_mask, BSWAP - pshufb BSWAP, DATA + # pshufb BSWAP, DATA + .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 call __clmul_gf128mul_ble - pshufb BSWAP, DATA + # pshufb BSWAP, DATA + .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 movups DATA, (%rdi) ret @@ -116,18 +118,21 @@ ENTRY(clmul_ghash_update) movaps .Lbswap_mask, BSWAP movups (%rdi), DATA movups (%rcx), SHASH - pshufb BSWAP, DATA + # pshufb BSWAP, DATA + .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 .align 4 .Lupdate_loop: movups (%rsi), IN1 - pshufb BSWAP, IN1 + # pshufb BSWAP, IN1 + .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 pxor IN1, DATA call __clmul_gf128mul_ble sub $16, %rdx add $16, %rsi cmp $16, %rdx jge .Lupdate_loop - pshufb BSWAP, DATA + # pshufb BSWAP, DATA + .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 movups DATA, (%rdi) .Lupdate_just_ret: ret @@ -140,7 +145,8 @@ ENTRY(clmul_ghash_update) ENTRY(clmul_ghash_setkey) movaps .Lbswap_mask, BSWAP movups (%rsi), %xmm0 - pshufb BSWAP, %xmm0 + # pshufb BSWAP, %xmm0 + .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 movaps %xmm0, %xmm1 psllq $1, %xmm0 psrlq $63, %xmm1 -- cgit v1.2.2 From 7a7732bc0f7c46f217dbec723f25366b6285cc42 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:31 -0800 Subject: x86: Unify fixup_irqs() for 32-bit and 64-bit kernels There is no reason to have different fixup_irqs() for 32-bit and 64-bit kernels. Unify by using the superior 64-bit version for both the kernels. Signed-off-by: Suresh Siddha Signed-off-by: Gary Hade Cc: Eric W. Biederman LKML-Reference: <20091026230001.562512739@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irq_32.c | 45 ------------------------------------ arch/x86/kernel/irq_64.c | 58 ----------------------------------------------- 3 files changed, 59 insertions(+), 103 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 391206199515..3ea66556e5e1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -276,3 +276,62 @@ void smp_generic_interrupt(struct pt_regs *regs) } EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); + +#ifdef CONFIG_HOTPLUG_CPU +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
*/ +void fixup_irqs(void) +{ + unsigned int irq; + static int warned; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + spin_lock(&desc->lock); + + affinity = desc->affinity; + if (!irq_has_action(irq) || + cpumask_equal(affinity, cpu_online_mask)) { + spin_unlock(&desc->lock); + continue; + } + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + if (desc->chip->mask) + desc->chip->mask(irq); + + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, affinity); + else if (!(warned++)) + set_affinity = 0; + + if (desc->chip->unmask) + desc->chip->unmask(irq); + + spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + printk("Broke affinity for irq %i\n", irq); + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 7d35d0fe2329..10709f29d166 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } - -#ifdef CONFIG_HOTPLUG_CPU - -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - affinity = desc->affinity; - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - printk("Breaking affinity for irq %i\n", irq); - affinity = cpu_all_mask; - } - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (desc->action) - printk_once("Cannot set affinity for irq %i\n", irq); - } - -#if 0 - barrier(); - /* Ingo Molnar says: "after the IO-APIC masks have been redirected - [note the nop - the interrupt-enable boundary on x86 is two - instructions from sti] - to flush out pending hardirqs and - IPIs. After this point nothing is supposed to reach this CPU." */ - __asm__ __volatile__("sti; nop; cli"); - barrier(); -#else - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -#endif -} -#endif - diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 977d8b43a0dd..acf8fbf8fbda 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } -#ifdef CONFIG_HOTPLUG_CPU -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
*/ -void fixup_irqs(void) -{ - unsigned int irq; - static int warned; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - int break_affinity = 0; - int set_affinity = 1; - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); - - affinity = desc->affinity; - if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); - continue; - } - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - break_affinity = 1; - affinity = cpu_all_mask; - } - - if (desc->chip->mask) - desc->chip->mask(irq); - - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (!(warned++)) - set_affinity = 0; - - if (desc->chip->unmask) - desc->chip->unmask(irq); - - spin_unlock(&desc->lock); - - if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); - } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif extern void call_softirq(void); -- cgit v1.2.2 From 84e21493a3b28c9fefe99fe827fc0c0c101a813d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:32 -0800 Subject: x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping In the presence of interrupt-remapping, irqs will be migrated in the process context and we don't do (and there is no need to) irq_chip mask/unmask while migrating the interrupt. Similarly fix the fixup_irqs() that get called during cpu offline and avoid calling irq_chip mask/unmask for irqs that are ok to be migrated in the process context. While we didn't observe any race condition with the existing code, this change takes complete advantage of interrupt-remapping in the newer generation platforms and avoids any potential HW lockup's (that often worry Eric :) Signed-off-by: Suresh Siddha Acked-by: Eric W. Biederman Cc: garyhade@us.ibm.com LKML-Reference: <20091026230001.661423939@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3ea66556e5e1..342bcbca19b4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -310,7 +310,7 @@ void fixup_irqs(void) affinity = cpu_all_mask; } - if (desc->chip->mask) + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) desc->chip->mask(irq); if (desc->chip->set_affinity) @@ -318,7 +318,7 @@ void fixup_irqs(void) else if (!(warned++)) set_affinity = 0; - if (desc->chip->unmask) + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) desc->chip->unmask(irq); spin_unlock(&desc->lock); -- cgit v1.2.2 From 23359a88e7eca3c4f402562b102f23014db3c2aa Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:33 -0800 Subject: x86: Remove move_cleanup_count from irq_cfg move_cleanup_count for each irq in irq_cfg is keeping track of the total number of cpus that need to free the corresponding vectors associated with the irq which has now been migrated to new destination. As long as this move_cleanup_count is non-zero (i.e., as long as we have n't freed the vector allocations on the old destinations) we were preventing the irq's further migration. 
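
(A minimal sketch of the gate being simplified; the helper below is illustrative only and not part of the patch, whose real change lands in __assign_irq_vector() and friends:)

        /* illustrative only: how further irq migration is gated */
        static bool vector_move_pending(struct irq_cfg *cfg)
        {
                /* before: cfg->move_in_progress || cfg->move_cleanup_count */
                /* after:  the pending cleanup IPI alone blocks migration */
                return cfg->move_in_progress;
        }
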
This cleanup count is unnecessary and it is enough to not allow the irq migration till we send the cleanup vector to the previous irq destination, for which we already have irq_cfg's move_in_progress. All we need to make sure is that we free the vector at the old desintation but we don't need to wait till that gets freed. Signed-off-by: Suresh Siddha Acked-by: Gary Hade Cc: Eric W. Biederman LKML-Reference: <20091026230001.752968906@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 1 - arch/x86/kernel/apic/io_apic.c | 9 +-------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1984ce9a13d2..6e124269fd4b 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -94,7 +94,6 @@ struct irq_cfg { struct irq_pin_list *irq_2_pin; cpumask_var_t domain; cpumask_var_t old_domain; - unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ce16b65cfdcc..e9e5b02c3af2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1161,7 +1161,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int cpu, err; cpumask_var_t tmp_mask; - if ((cfg->move_in_progress) || cfg->move_cleanup_count) + if (cfg->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -2234,14 +2234,10 @@ void send_cleanup_vector(struct irq_cfg *cfg) if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } @@ -2430,8 +2426,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2449,7 +2443,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) goto unlock; } __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; unlock: spin_unlock(&desc->lock); } -- cgit v1.2.2 From a5e74b841930bec78a4684ab9f208b2ddfe7c736 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:34 -0800 Subject: x86: Force irq complete move during cpu offline When a cpu goes offline, fixup_irqs() try to move irq's currently destined to the offline cpu to a new cpu. But this attempt will fail if the irq is recently moved to this cpu and the irq still hasn't arrived at this cpu (for non intr-remapping platforms this is when we free the vector allocation at the previous destination) that is about to go offline. This will endup with the interrupt subsystem still pointing the irq to the offline cpu, causing that irq to not work any more. Fix this by forcing the irq to complete its move (its been a long time we moved the irq to this cpu which we are offlining now) and then move this irq to a new cpu before this cpu goes offline. Signed-off-by: Suresh Siddha Acked-by: Gary Hade Cc: Eric W. 
Biederman LKML-Reference: <20091026230001.848830905@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq.h | 1 + arch/x86/kernel/apic/io_apic.c | 18 +++++++++++++++--- arch/x86/kernel/irq.c | 7 +++++++ 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index ddda6cbed6f4..ffd700ff5dcb 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -34,6 +34,7 @@ static inline int irq_canonicalize(int irq) #ifdef CONFIG_HOTPLUG_CPU #include extern void fixup_irqs(void); +extern void irq_force_complete_move(int); #endif extern void (*generic_interrupt_extension)(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e9e5b02c3af2..4e886efd9a15 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2450,21 +2450,33 @@ unlock: irq_exit(); } -static void irq_complete_move(struct irq_desc **descp) +static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; struct irq_cfg *cfg = desc->chip_data; - unsigned vector, me; + unsigned me; if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); } + +static void irq_complete_move(struct irq_desc **descp) +{ + __irq_complete_move(descp, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + + __irq_complete_move(&desc, cfg->vector); +} #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 342bcbca19b4..b10a5e1da06c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -305,6 +305,13 @@ void fixup_irqs(void) continue; } + /* + * Complete the irq move. This cpu is going down and for + * non intr-remapping case, we can't wait till this interrupt + * arrives at this cpu before completing the irq move. + */ + irq_force_complete_move(irq); + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; affinity = cpu_all_mask; -- cgit v1.2.2 From b3ec0a37a7907813bb4fb85a2d94102c152470b7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:35 -0800 Subject: x86: Use EOI register in io-apic on intel platforms IO-APIC's in intel chipsets support EOI register starting from IO-APIC version 2. Use that when ever we need to clear the IO-APIC RTE's RemoteIRR bit explicitly. Signed-off-by: Suresh Siddha Acked-by: Gary Hade Cc: Eric W. 
Biederman LKML-Reference: <20091026230001.947855317@sbs-t61.sc.intel.com> [ Marked use_eio_reg as __read_mostly, fixed small details ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 81 ++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4e886efd9a15..31e9db3c12ad 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2492,6 +2492,51 @@ static void ack_apic_edge(unsigned int irq) atomic_t irq_mis_count; +static int use_eoi_reg __read_mostly; + +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } +} + +static void eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static int ioapic_supports_eoi(void) +{ + struct pci_dev *root; + + root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0)); + if (root && root->vendor == PCI_VENDOR_ID_INTEL && + mp_ioapics[0].apicver >= 0x2) { + use_eoi_reg = 1; + printk(KERN_INFO "IO-APIC supports EOI register\n"); + } else + printk(KERN_INFO "IO-APIC doesn't support EOI\n"); + + return 0; +} + +fs_initcall(ioapic_supports_eoi); + static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2575,37 +2620,19 @@ static void ack_apic_level(unsigned int irq) /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); + + if (use_eoi_reg) + eoi_ioapic_irq(desc); + else { + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); + spin_unlock(&ioapic_lock); + } } } #ifdef CONFIG_INTR_REMAP -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, cfg->irq_2_pin) - io_apic_eoi(entry->apic, entry->pin); -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); -- cgit v1.2.2 From 5231a68614b94f60e8f6c56bc6e3d75955b9e75e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:36 -0800 Subject: x86: Remove local_irq_enable()/local_irq_disable() in fixup_irqs() To ensure that we handle all the pending interrupts (destined for this cpu that is going down) in the interrupt subsystem before the cpu goes offline, fixup_irqs() does: local_irq_enable(); mdelay(1); local_irq_disable(); Enabling interrupts is not a good thing as this cpu is already offline. So this patch replaces that logic with, mdelay(1); check APIC_IRR bits Retrigger the irq at the new destination if any interrupt has arrived via IPI. For IO-APIC level triggered interrupts, this retrigger IPI will appear as an edge interrupt. 
ack_apic_level() will detect this condition and IO-APIC RTE's remoteIRR is cleared using directed EOI(using IO-APIC EOI register) on Intel platforms and for others it uses the existing mask+edge logic followed by unmask+level. We can also remove mdelay() and then send spuriuous interrupts to new cpu targets for all the irqs that were handled previously by this cpu that is going offline. While it works, I have seen spurious interrupt messages (nothing wrong but still annoying messages during cpu offline, which can be seen during suspend/resume etc) Signed-off-by: Suresh Siddha Acked-by: Gary Hade Cc: Eric W. Biederman LKML-Reference: <20091026230002.043281924@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b10a5e1da06c..8a82728d47c1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -281,7 +281,7 @@ EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irq; + unsigned int irq, vector; static int warned; struct irq_desc *desc; @@ -336,9 +336,33 @@ void fixup_irqs(void) printk("Cannot set affinity for irq %i\n", irq); } - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ mdelay(1); - local_irq_disable(); + + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irr; + + if (__get_cpu_var(vector_irq)[vector] < 0) + continue; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) { + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + spin_lock(&desc->lock); + if (desc->chip->retrigger) + desc->chip->retrigger(irq); + spin_unlock(&desc->lock); + } + } } #endif -- cgit v1.2.2 From 502f660466ba7a66711ffdf414b1f7f1131dcbf7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 28 Oct 2009 18:46:56 -0800 Subject: x86, cpa: Fix kernel text RO checks in static_protection() Steven Rostedt reported that we are unconditionally making the kernel text mapping as read-only. i.e., if someone does cpa() to the kernel text area for setting/clearing any page table attribute, we unconditionally clear the read-write attribute for the kernel text mapping that is set at compile time. We should delay (to forbid the write attribute) and enforce only after the kernel has mapped the text as read-only. 
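
The new condition can be read as a predicate (the helper below is a hypothetical sketch mirroring the patched check; the real change lands inline in static_protections() in the hunk that follows):

        /* hypothetical helper mirroring the patched condition */
        static inline bool text_rw_forbidden(unsigned long address)
        {
                /* only after mark_rodata_ro() sets kernel_set_to_readonly */
                return kernel_set_to_readonly &&
                       within(address, (unsigned long)_text,
                              (unsigned long)__end_rodata_hpage_align);
        }
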
Reported-by: Steven Rostedt Signed-off-by: Suresh Siddha Acked-by: Steven Rostedt Tested-by: Steven Rostedt LKML-Reference: <20091029024820.996634347@sbs-t61.sc.intel.com> [ marked kernel_set_to_readonly as __read_mostly ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cacheflush.h | 1 + arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pageattr.c | 10 ++++++---- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index b54f6afe7ec4..eebb2cd2b9bf 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -176,6 +176,7 @@ void clflush_cache_range(void *addr, unsigned int size); #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; +extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); #else diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f64d0d5e0f89..c973f8e2a6cf 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0ed09fad6aa1..4b507c089402 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -695,7 +695,7 @@ void __init mem_init(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly; void set_kernel_text_rw(void) { diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 78d3168b3c64..8d1e8d95ea45 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -282,14 +282,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && \ !defined(CONFIG_DYNAMIC_FTRACE) /* - * Kernel text mappings for the large page aligned .rodata section - * will be read-only. For the kernel identity mappings covering - * the holes caused by this alignment can be anything. + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything that user asks. * * This will preserve the large page mappings for kernel text/data * at no extra cost. */ - if (within(address, (unsigned long)_text, + if (kernel_set_to_readonly && + within(address, (unsigned long)_text, (unsigned long)__end_rodata_hpage_align)) pgprot_val(forbidden) |= _PAGE_RW; #endif -- cgit v1.2.2 From 55ca3cc1746335bb6ef1d3894ddb6d0c729b3518 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 28 Oct 2009 18:46:57 -0800 Subject: x86_64, ftrace: Make ftrace use kernel identity mapping to modify code On x86_64, kernel text mappings are mapped read-only with CONFIG_DEBUG_RODATA. So use the kernel identity mapping instead of the kernel text mapping to modify the kernel text. 
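For illustration, the address translation this relies on, as a minimal sketch (the helper name patch_alias() is hypothetical; the same two lines appear inside do_ftrace_mod_code() in the diff below):

        /*
         * The text mapping of 'ip' may be read-only, but the kernel
         * identity mapping of the same physical page stays writable,
         * so patch the code through that alias instead.
         */
        static void *patch_alias(unsigned long ip)
        {
                if (within(ip, (unsigned long)_text, (unsigned long)_etext))
                        ip = (unsigned long)__va(__pa(ip));
                return (void *)ip;
        }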
Signed-off-by: Suresh Siddha Acked-by: Steven Rostedt Tested-by: Steven Rostedt LKML-Reference: <20091029024821.080941108@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 17 +++++++++++++++++ arch/x86/mm/pageattr.c | 3 +-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9dbb527e1652..944e9820b4b5 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -187,9 +187,26 @@ static void wait_for_nmi(void) nmi_wait_count++; } +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + static int do_ftrace_mod_code(unsigned long ip, void *new_code) { + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are the same and we can use + * the kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + mod_code_ip = (void *)ip; mod_code_newcode = new_code; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8d1e8d95ea45..09a140ca7be8 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -279,8 +279,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && \ - !defined(CONFIG_DYNAMIC_FTRACE) +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections -- cgit v1.2.2 From e7d23dde9b7ebb575e2bcee2abefc9ec1e4adde9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 28 Oct 2009 18:46:58 -0800 Subject: x86_64, cpa: Use only text section in set_kernel_text_rw/ro set_kernel_text_rw()/set_kernel_text_ro() are marking pages starting from _text to __start_rodata as RW or RO. With CONFIG_DEBUG_RODATA, there might be free pages (associated with padding the sections to the 2MB large page boundary) between the text and rodata sections that are given back to the page allocator. So we should use only the start (_text) and end (__stop___ex_table) of the text section in set_kernel_text_rw()/set_kernel_text_ro(). Signed-off-by: Suresh Siddha Acked-by: Steven Rostedt Tested-by: Steven Rostedt LKML-Reference: <20091029024821.164525222@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4b507c089402..5198b9bb34ef 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -700,7 +700,7 @@ int kernel_set_to_readonly; void set_kernel_text_rw(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -708,13 +708,18 @@ void set_kernel_text_rw(void) pr_debug("Set kernel text: %lx - %lx for read write\n", start, end); + /* + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO.
Refer to the comment in + * static_protections() in pageattr.c + */ set_memory_rw(start, (end - start) >> PAGE_SHIFT); } void set_kernel_text_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -722,6 +727,9 @@ void set_kernel_text_ro(void) pr_debug("Set kernel text: %lx - %lx for read only\n", start, end); + /* + * Set the kernel identity mapping for text RO. + */ set_memory_ro(start, (end - start) >> PAGE_SHIFT); } -- cgit v1.2.2 From 3b0d65969b549b796abc6f0230f6142fed365d49 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 3 Nov 2009 09:11:15 -0500 Subject: crypto: ghash-intel - Add PSHUFB macros Add PSHUFB macros instead of repeating byte sequences, suggested by Ingo. Signed-off-by: Herbert Xu Acked-by: Ingo Molnar --- arch/x86/crypto/ghash-clmulni-intel_asm.S | 11 ++++++----- arch/x86/include/asm/i387.h | 7 +++++++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 71768d543dbb..59584982fb75 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -17,6 +17,7 @@ */ #include +#include .align 16 .Lbswap_mask: @@ -101,7 +102,7 @@ ENTRY(clmul_ghash_mul) movups (%rsi), SHASH movaps .Lbswap_mask, BSWAP # pshufb BSWAP, DATA - .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 + PSHUFB_XMM5_XMM0 call __clmul_gf128mul_ble # pshufb BSWAP, DATA .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 @@ -119,12 +120,12 @@ ENTRY(clmul_ghash_update) movups (%rdi), DATA movups (%rcx), SHASH # pshufb BSWAP, DATA - .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 + PSHUFB_XMM5_XMM0 .align 4 .Lupdate_loop: movups (%rsi), IN1 # pshufb BSWAP, IN1 - .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 + PSHUFB_XMM5_XMM6 pxor IN1, DATA call __clmul_gf128mul_ble sub $16, %rdx @@ -132,7 +133,7 @@ ENTRY(clmul_ghash_update) cmp $16, %rdx jge .Lupdate_loop # pshufb BSWAP, DATA - .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 + PSHUFB_XMM5_XMM0 movups DATA, (%rdi) .Lupdate_just_ret: ret @@ -146,7 +147,7 @@ ENTRY(clmul_ghash_setkey) movaps .Lbswap_mask, BSWAP movups (%rsi), %xmm0 # pshufb BSWAP, %xmm0 - .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 + PSHUFB_XMM5_XMM0 movaps %xmm0, %xmm1 psllq $1, %xmm0 psrlq $63, %xmm1 diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 0b20bbb758f2..ebfb8a9e11f7 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -10,6 +10,8 @@ #ifndef _ASM_X86_I387_H #define _ASM_X86_I387_H +#ifndef __ASSEMBLY__ + #include #include #include @@ -411,4 +413,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) } } +#endif /* __ASSEMBLY__ */ + +#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 +#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 + #endif /* _ASM_X86_I387_H */ -- cgit v1.2.2 From a489ca355efaf9efa4990b0f8f30ab650a206273 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 2 Nov 2009 16:59:15 -0800 Subject: x86: Make sure we also print a Code: line for show_regs() show_regs() is called as a mini BUG() equivalent in some places, specifically for the "scheduling while atomic" case. Unfortunately right now it does not print a Code: line unlike a real bug/oops. This patch changes the x86 implementation of show_regs() so that it calls the same function as oopses do to print the registers as well as the Code: line. 
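For illustration, a hypothetical snippet that exercises this path: sleeping under a spinlock makes the scheduler complain, and the resulting "scheduling while atomic" warning ends up in show_regs(), which now prints a Code: line too.

        #include <linux/spinlock.h>
        #include <linux/delay.h>

        static DEFINE_SPINLOCK(demo_lock);

        static void provoke_show_regs(void)
        {
                spin_lock(&demo_lock);  /* atomic context */
                msleep(10);             /* sleeps -> "BUG: scheduling while atomic" */
                spin_unlock(&demo_lock);
        }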
Signed-off-by: Arjan van de Ven LKML-Reference: <20091102165915.4a980fc0@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf79567cdab..e658331edb6d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -187,7 +187,7 @@ void __show_regs(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - __show_regs(regs, 1); + show_registers(regs); show_trace(NULL, regs, &regs->sp, regs->bp); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..2386999bfcd2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -226,8 +226,7 @@ void __show_regs(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - printk(KERN_INFO "CPU %d:", smp_processor_id()); - __show_regs(regs, 1); + show_registers(regs); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } -- cgit v1.2.2 From 01dd95827726534230d8f03f7e6faafe24e49260 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 3 Nov 2009 10:55:20 -0500 Subject: crypto: ghash-intel - Fix irq_fpu_usable usage When kernel_fpu_using was renamed to irq_fpu_usable, the semantics of the function changed too: from measuring whether the kernel is using the FPU (that is, the FPU is NOT available) to measuring whether the FPU is usable (that is, the FPU is available). But the usage of irq_fpu_usable in ghash-clmulni-intel_glue.c was not changed accordingly. This patch fixes this. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_glue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 65d409644d72..cbcc8d8ea93a 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -159,7 +159,7 @@ static int ghash_async_init(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (irq_fpu_usable()) { + if (!irq_fpu_usable()) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_init(cryptd_req); @@ -177,7 +177,7 @@ static int ghash_async_update(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); - if (irq_fpu_usable()) { + if (!irq_fpu_usable()) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; @@ -195,7 +195,7 @@ static int ghash_async_final(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); - if (irq_fpu_usable()) { + if (!irq_fpu_usable()) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; @@ -216,7 +216,7 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (irq_fpu_usable()) { + if (!irq_fpu_usable()) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); return crypto_ahash_digest(cryptd_req); -- cgit v1.2.2 From 
41a48d14f6991020c9bb6b93e289ca5b411ed09a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 5 Oct 2009 19:23:06 +0900 Subject: x86/hw-breakpoints: Actually flush thread breakpoints in flush_thread(). flush_thread() tries to do a TIF_DEBUG check before calling in to flush_thread_hw_breakpoint() (which subsequently clears the thread flag), but for some reason, the x86 code is manually clearing TIF_DEBUG immediately before the test, so this path will never be taken. This kills off the erroneous clear_tsk_thread_flag() and lets flush_thread_hw_breakpoint() actually get invoked. Presumably folks were getting lucky with testing and the free_thread_info() -> free_thread_xstate() path was taking care of the flush there. Signed-off-by: Paul Mundt Acked-by: "K.Prasad" Cc: Ingo Molnar Cc: Alan Stern LKML-Reference: <20091005102306.GA7889@linux-sh.org> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2275ce5776de..cf8ee0016307 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -107,8 +107,6 @@ void flush_thread(void) } #endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) flush_thread_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -- cgit v1.2.2 From 97829de5a3b88899c5f3ac8802d11868bf4180ba Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 3 Nov 2009 14:02:05 -0500 Subject: x86, 64-bit: Fix bstep_iret jump This jump should be unconditional. Signed-off-by: Brian Gerst LKML-Reference: <1257274925-15713-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index af0f4b226dbe..1579a6c59cfd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1501,7 +1501,7 @@ error_kernelspace: bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) - je error_swapgs + jmp error_swapgs END(error_entry) -- cgit v1.2.2 From 09879b99d44d701c603935ef2549004405d7f8f9 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 12:58:15 +0900 Subject: x86: Gitignore: arch/x86/lib/inat-tables.c Ignore generated file arch/x86/lib/inat-tables.c. Signed-off-by: Hiroshi Shimamoto Acked-by: Masami Hiramatsu LKML-Reference: <4AF0FBD7.7000501@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/x86/lib/.gitignore (limited to 'arch/x86') diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore new file mode 100644 index 000000000000..8df89f0a3fe6 --- /dev/null +++ b/arch/x86/lib/.gitignore @@ -0,0 +1 @@ +inat-tables.c -- cgit v1.2.2 From ce7c42710e2dd133f10b7fc9ed9c73bdd2435f7a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:53:52 +1030 Subject: cpumask: Avoid cpumask_t in arch/x86/kernel/apic/nmi.c Ingo wants the certainty of a static cpumask (rather than a cpumask_var_t), but cpumask_t will some day be undefined to avoid on-stack declarations. This is what DECLARE_BITMAP/to_cpumask() is for. 
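For illustration, the idiom in a minimal, self-contained form (the demo names are hypothetical):

        #include <linux/cpumask.h>

        /* Static storage sized for NR_CPUS, without declaring a cpumask_t */
        static DECLARE_BITMAP(demo_mask, NR_CPUS) __read_mostly;

        static void demo(void)
        {
                /* to_cpumask() adapts the bitmap to the cpumask API */
                cpumask_copy(to_cpumask(demo_mask), cpu_online_mask);
                cpumask_clear_cpu(0, to_cpumask(demo_mask));
        }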
Signed-off-by: Rusty Russell LKML-Reference: <200911031453.52394.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 7ff61d6a188a..6389432a9dbf 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,8 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_t backtrace_mask __read_mostly; +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (cpumask_test_cpu(cpu, &backtrace_mask)) { + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); @@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, &backtrace_mask); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); rc = 1; } @@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(&backtrace_mask, cpu_online_mask); + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); apic->send_IPI_all(NMI_VECTOR); /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(&backtrace_mask)) + if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); } -- cgit v1.2.2 From 6ac5c5310ca9d7dd3d7e677c2715b1f06a348330 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:58:30 +1030 Subject: cpumask: Use modern cpumask style in arch/x86/kernel/cpu/mcheck/mce-inject.c Note that there's no freeing the cpu var, since this module has no unload function. 
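For illustration, the usual cpumask_var_t lifecycle in a hypothetical module (mce-inject only needs the allocation half, since it cannot be unloaded):

        #include <linux/cpumask.h>
        #include <linux/gfp.h>

        static cpumask_var_t demo_cpumask;

        static int __init demo_init(void)
        {
                /* Allocates off-stack storage with CONFIG_CPUMASK_OFFSTACK;
                 * otherwise it is effectively a no-op that returns true. */
                if (!alloc_cpumask_var(&demo_cpumask, GFP_KERNEL))
                        return -ENOMEM;
                return 0;
        }

        static void __exit demo_exit(void)
        {
                free_cpumask_var(demo_cpumask);
        }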
Signed-off-by: Rusty Russell Cc: Andi Kleen Cc: Huang Ying LKML-Reference: <200911031458.30987.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 472763d92098..73734baa50f2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) m->finished = 0; } -static cpumask_t mce_inject_cpumask; +static cpumask_var_t mce_inject_cpumask; static int mce_raise_notify(struct notifier_block *self, unsigned long val, void *data) @@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self, struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) return NOTIFY_DONE; - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) raise_exception(m, args->regs); else if (m->status) @@ -148,22 +148,22 @@ static void raise_mce(struct mce *m) unsigned long start; int cpu; get_online_cpus(); - mce_inject_cpumask = cpu_online_map; - cpu_clear(get_cpu(), mce_inject_cpumask); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); for_each_online_cpu(cpu) { struct mce *mcpu = &per_cpu(injectm, cpu); if (!mcpu->finished || MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpus_empty(mce_inject_cpumask)) - apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) + apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); start = jiffies; - while (!cpus_empty(mce_inject_cpumask)) { + while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR "Timeout waiting for mce inject NMI %lx\n", - *cpus_addr(mce_inject_cpumask)); + *cpumask_bits(mce_inject_cpumask)); break; } cpu_relax(); @@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, static int inject_init(void) { + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; register_die_notifier(&mce_raise_nb); -- cgit v1.2.2 From d7d3756c5b1277fafd132ce7a2211b388c3b5bd2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:58:38 +1030 Subject: cpumask: Use modern cpumask style in Xen Signed-off-by: Rusty Russell Cc: Jeremy Fitzhardinge LKML-Reference: <200911031458.38406.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index fe03eeed7b48..738da0cb0d8b 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -73,7 +73,7 @@ static __cpuinit void cpu_bringup(void) xen_setup_cpu_clockevents(); - cpu_set(cpu, cpu_online_map); + set_cpu_online(cpu, true); percpu_write(cpu_state, CPU_ONLINE); wmb(); -- cgit v1.2.2 From 99935a7a59eaca0292c1a5880e10bae03f4a5e3d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 4 Oct 2009 
21:54:24 -0700 Subject: x86/PCI: read root resources from IOH on Intel For intel systems with multi IOH, we should read peer root resources directly from PCI config space, and don't trust _CRS. Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/Makefile | 1 + arch/x86/pci/amd_bus.c | 45 ++++++++++-------------- arch/x86/pci/bus_numa.h | 26 ++++++++++++++ arch/x86/pci/intel_bus.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 arch/x86/pci/bus_numa.h create mode 100644 arch/x86/pci/intel_bus.c (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index d49202e740ea..56d917b556c6 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -15,3 +15,4 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o obj-y += amd_bus.o +obj-$(CONFIG_X86_64) += intel_bus.o diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 572ee9782f2a..995f36096a42 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -10,6 +10,8 @@ #include #endif +#include "bus_numa.h" + /* * This discovers the pcibus <-> node mapping on AMD K8. * also get peer root bus resource for io,mmio @@ -17,25 +19,9 @@ #ifdef CONFIG_X86_64 -/* - * sub bus (transparent) will use entres from 3 to store extra from root, - * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? - */ -#define RES_NUM 16 -struct pci_root_info { - char name[12]; - unsigned int res_num; - struct resource res[RES_NUM]; - int bus_min; - int bus_max; - int node; - int link; -}; - -/* 4 at this time, it may become to 32 */ -#define PCI_ROOT_NR 4 -static int pci_root_num; -static struct pci_root_info pci_root_info[PCI_ROOT_NR]; +int pci_root_num; +struct pci_root_info pci_root_info[PCI_ROOT_NR]; +static int found_all_numa_early; void x86_pci_root_bus_res_quirks(struct pci_bus *b) { @@ -48,8 +34,11 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) b->resource[1] != &iomem_resource) return; - /* if only one root bus, don't need to anything */ - if (pci_root_num < 2) + if (!pci_root_num) + return; + + /* for amd, if only one root bus, don't need to do anything */ + if (pci_root_num < 2 && found_all_numa_early) return; for (i = 0; i < pci_root_num; i++) { @@ -130,12 +119,15 @@ static void __init update_range(struct res_range *range, size_t start, } } -static void __init update_res(struct pci_root_info *info, size_t start, +void __init update_res(struct pci_root_info *info, size_t start, size_t end, unsigned long flags, int merge) { int i; struct resource *res; + if (start > end) + return; + if (!merge) goto addit; @@ -230,7 +222,6 @@ static int __init early_fill_mp_bus_info(void) int j; unsigned bus; unsigned slot; - int found; int node; int link; int def_node; @@ -247,7 +238,7 @@ static int __init early_fill_mp_bus_info(void) if (!early_pci_allowed()) return -1; - found = 0; + found_all_numa_early = 0; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { u32 id; u16 device; @@ -261,12 +252,12 @@ static int __init early_fill_mp_bus_info(void) device = (id>>16) & 0xffff; if (pci_probes[i].vendor == vendor && pci_probes[i].device == device) { - found = 1; + found_all_numa_early = 1; break; } } - if (!found) + if (!found_all_numa_early) return 0; pci_root_num = 0; @@ -488,7 +479,7 @@ static int __init early_fill_mp_bus_info(void) info = &pci_root_info[i]; res_num = info->res_num; busnum = info->bus_min; - printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", + printk(KERN_DEBUG "bus: 
[%02x, %02x] on node %x link %x\n", info->bus_min, info->bus_max, info->node, info->link); for (j = 0; j < res_num; j++) { res = &info->res[j]; diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h new file mode 100644 index 000000000000..4ff126a3e887 --- /dev/null +++ b/arch/x86/pci/bus_numa.h @@ -0,0 +1,26 @@ +#ifdef CONFIG_X86_64 + +/* + * sub bus (transparent) will use entres from 3 to store extra from + * root, so need to make sure we have enought slot there, Should we + * increase PCI_BUS_NUM_RESOURCES? + */ +#define RES_NUM 16 +struct pci_root_info { + char name[12]; + unsigned int res_num; + struct resource res[RES_NUM]; + int bus_min; + int bus_max; + int node; + int link; +}; + +/* 4 at this time, it may become to 32 */ +#define PCI_ROOT_NR 4 +extern int pci_root_num; +extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; + +extern void update_res(struct pci_root_info *info, size_t start, + size_t end, unsigned long flags, int merge); +#endif diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c new file mode 100644 index 000000000000..b7a55dc55d13 --- /dev/null +++ b/arch/x86/pci/intel_bus.c @@ -0,0 +1,90 @@ +/* + * to read io range from IOH pci conf, need to do it after mmconfig is there + */ + +#include +#include +#include +#include +#include + +#include "bus_numa.h" + +static inline void print_ioh_resources(struct pci_root_info *info) +{ + int res_num; + int busnum; + int i; + + printk(KERN_DEBUG "IOH bus: [%02x, %02x]\n", + info->bus_min, info->bus_max); + res_num = info->res_num; + busnum = info->bus_min; + for (i = 0; i < res_num; i++) { + struct resource *res; + + res = &info->res[i]; + printk(KERN_DEBUG "IOH bus: %02x index %x %s: [%llx, %llx]\n", + busnum, i, + (res->flags & IORESOURCE_IO) ? "io port" : + "mmio", + res->start, res->end); + } +} + +#define IOH_LIO 0x108 +#define IOH_LMMIOL 0x10c +#define IOH_LMMIOH 0x110 +#define IOH_LMMIOH_BASEU 0x114 +#define IOH_LMMIOH_LIMITU 0x118 +#define IOH_LCFGBUS 0x11c + +static void __devinit pci_root_bus_res(struct pci_dev *dev) +{ + u16 word; + u32 dword; + struct pci_root_info *info; + u16 io_base, io_end; + u32 mmiol_base, mmiol_end; + u64 mmioh_base, mmioh_end; + int bus_base, bus_end; + + if (pci_root_num >= PCI_ROOT_NR) { + printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n"); + return; + } + + info = &pci_root_info[pci_root_num]; + pci_root_num++; + + pci_read_config_word(dev, IOH_LCFGBUS, &word); + bus_base = (word & 0xff); + bus_end = (word & 0xff00) >> 8; + sprintf(info->name, "PCI Bus #%02x", bus_base); + info->bus_min = bus_base; + info->bus_max = bus_end; + + pci_read_config_word(dev, IOH_LIO, &word); + io_base = (word & 0xf0) << (12 - 4); + io_end = (word & 0xf000) | 0xfff; + update_res(info, io_base, io_end, IORESOURCE_IO, 0); + + pci_read_config_dword(dev, IOH_LMMIOL, &dword); + mmiol_base = (dword & 0xff00) << (24 - 8); + mmiol_end = (dword & 0xff000000) | 0xffffff; + update_res(info, mmiol_base, mmiol_end, IORESOURCE_MEM, 0); + + pci_read_config_dword(dev, IOH_LMMIOH, &dword); + mmioh_base = ((u64)(dword & 0xfc00)) << (26 - 10); + mmioh_end = ((u64)(dword & 0xfc000000) | 0x3ffffff); + pci_read_config_dword(dev, IOH_LMMIOH_BASEU, &dword); + mmioh_base |= ((u64)(dword & 0x7ffff)) << 32; + pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword); + mmioh_end |= ((u64)(dword & 0x7ffff)) << 32; + update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0); + + print_ioh_resources(info); +} + +/* intel IOH */ +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x342e, pci_root_bus_res); -- cgit 
v1.2.2 From ac1aa47b131416a6ff37eb1005a0a1d2541aad6c Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Mon, 26 Oct 2009 13:20:44 -0700 Subject: PCI: determine CLS more intelligently Till now, CLS has been determined either by arch code or as L1_CACHE_BYTES. Only x86 and ia64 set CLS explicitly and x86 doesn't always get it right. On most configurations, the chance is that firmware configures the correct value during boot. This patch makes pci_init() determine CLS by looking at what firmware has configured. It scans all devices and if all non-zero values agree, the value is used. If none is configured or there is a disagreement, pci_dfl_cache_line_size is used. arch can set the dfl value (via PCI_CACHE_LINE_BYTES or pci_dfl_cache_line_size) or override the actual one. ia64, x86 and sparc64 updated to set the default cls instead of the actual one. While at it, declare pci_cache_line_size and pci_dfl_cache_line_size in pci.h and drop private declarations from arch code. Signed-off-by: Tejun Heo Acked-by: David Miller Acked-by: Greg KH Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Jesse Barnes --- arch/x86/pci/common.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 1331fcf26143..fbeec31316cf 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -410,8 +410,6 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) return bus; } -extern u8 pci_cache_line_size; - int __init pcibios_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -426,11 +424,11 @@ int __init pcibios_init(void) * and P4. It's also good for 386/486s (which actually have 16) * as quite a few PCI devices do not support smaller values. */ - pci_cache_line_size = 32 >> 2; + pci_dfl_cache_line_size = 32 >> 2; if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) - pci_cache_line_size = 64 >> 2; /* K7 & K8 */ + pci_dfl_cache_line_size = 64 >> 2; /* K7 & K8 */ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) - pci_cache_line_size = 128 >> 2; /* P4 */ + pci_dfl_cache_line_size = 128 >> 2; /* P4 */ pcibios_resource_survey(); -- cgit v1.2.2 From 76b1a87b217927f905f4b01c586452b2a1d33913 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 14 Oct 2009 16:31:39 -0400 Subject: x86/PCI: Use generic cacheline sizing instead of per-vendor tests. Instead of the PCI code needing to have code to determine the cacheline size of each processor, use the data the cpu identification code should have already determined during early boot. (The vendor checks are also incomplete, and don't take into account modern CPUs) I've been carrying a variant of this code in Fedora for a while, that prints debug information. There are a number of cases where we are currently setting the PCI cacheline size to 32 bytes, when the CPU cacheline size is 64 bytes. With this patch, we set them both the same. Signed-off-by: Dave Jones Signed-off-by: Jesse Barnes --- arch/x86/pci/common.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index fbeec31316cf..d2552c68e94d 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -420,15 +420,19 @@ int __init pcibios_init(void) } /* - * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 - * and P4. It's also good for 386/486s (which actually have 16) + * Set PCI cacheline size to that of the CPU if the CPU has reported it. 
+ (For older CPUs that don't support cpuid, we set it to 32 bytes.) + It's also good for 386/486s (which actually have 16) * as quite a few PCI devices do not support smaller values. */ - pci_dfl_cache_line_size = 32 >> 2; - if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) - pci_dfl_cache_line_size = 64 >> 2; /* K7 & K8 */ - else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) - pci_dfl_cache_line_size = 128 >> 2; /* P4 */ + if (c->x86_clflush_size > 0) { + pci_dfl_cache_line_size = c->x86_clflush_size >> 2; + printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n", + pci_dfl_cache_line_size << 2); + } else { + pci_dfl_cache_line_size = 32 >> 2; + printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n"); + } pcibios_resource_survey(); -- cgit v1.2.2 From 42887b29ced263ec3b8bd26ef157a324789b89d9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 6 Oct 2009 15:33:49 -0600 Subject: x86/PCI: print resources consistently with %pRt This uses %pRt to print additional resource information (type, size, prefetchability, etc.) consistently. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 14 +++++++++++--- arch/x86/pci/i386.c | 12 +++++------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 1014eb4bfc37..9b3daf976732 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -7,6 +7,7 @@ #include struct pci_root_info { + struct acpi_device *bridge; char *name; unsigned int res_num; struct resource *res; @@ -107,12 +108,18 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->child = NULL; if (insert_resource(root, res)) { - printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " - "from %s for %s\n", (unsigned long) res->start, - (unsigned long) res->end, root->name, info->name); + dev_err(&info->bridge->dev, "can't allocate %pRt\n", res); } else { info->bus->resource[info->res_num] = res; info->res_num++; + if (addr.translation_offset) + dev_info(&info->bridge->dev, "host bridge window: %pRt " + "(PCI address [%#llx-%#llx])\n", + res, res->start - addr.translation_offset, + res->end - addr.translation_offset); + else + dev_info(&info->bridge->dev, + "host bridge window: %pRt\n", res); } return AE_OK; } @@ -124,6 +131,7 @@ get_current_resources(struct acpi_device *device, int busnum, struct pci_root_info info; size_t size; + info.bridge = device; info.bus = bus; info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index b22d13b0c71d..a70a85de5e84 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -129,7 +129,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); + dev_info(&dev->dev, "BAR %d: can't allocate %pRt\n", idx, r); /* * Something is wrong with the region.
* Invalidate the resource to prevent @@ -164,12 +164,10 @@ static void __init pcibios_allocate_resources(int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", - (unsigned long long) r->start, - (unsigned long long) r->end, - r->flags, disabled, pass); + dev_dbg(&dev->dev, "%pRf (d=%d, p=%d)\n", r, + disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); + dev_info(&dev->dev, "BAR %d: can't allocate %pRt\n", idx, r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; @@ -182,7 +180,7 @@ static void __init pcibios_allocate_resources(int pass) /* Turn the ROM off, leave the resource region, * but keep it unregistered. */ u32 reg; - dev_dbg(&dev->dev, "disabling ROM\n"); + dev_dbg(&dev->dev, "disabling ROM %pRt\n", r); r->flags &= ~IORESOURCE_ROM_ENABLE; pci_read_config_dword(dev, dev->rom_base_reg, &reg); -- cgit v1.2.2 From 2992e545ea006992ec9dc91c4fa996ce1e15f921 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 13:21:32 -0800 Subject: x86/PCI/PAT: return EINVAL for pci mmap WC request for !pat_enabled Thomas Schlichter reported: > X.org uses libpciaccess which tries to mmap with write combining enabled via > /sys/bus/pci/devices/*/resource0_wc. Currently, when PAT is not enabled, the > kernel does fall back to uncached mmap. Then libpciaccess thinks it succeeded > mapping with write combining enabled and does not set up suited MTRR entries. > ;-( Instead of silently mapping pci mmap region as UC minus in the case of !pat_enabled and wc request, we can return error. Eric Anholt mentioned that caller (like X) typically follows up with UC minus pci mmap request and if there is a free mtrr slot, caller will manage adding WC mtrr. Jesse Barnes says: > Older versions of libpciaccess will behave better if we do it that way > (iirc it only allocates an MTRR if the resource_wc file doesn't exist or > fails to get mapped). Reported-by: Thomas Schlichter Signed-off-by: Thomas Schlichter Signed-off-by: Suresh Siddha Acked-by: Eric Anholt Acked-by: Jesse Barnes Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index a70a85de5e84..52e656f17781 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -280,6 +280,15 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return -EINVAL; prot = pgprot_val(vma->vm_page_prot); + + /* + * Return error if pat is not enabled and write_combine is requested. + * Caller can followup with UC MINUS request and add a WC mtrr if there + * is a free mtrr slot. + */ + if (!pat_enabled && write_combine) + return -EINVAL; + if (pat_enabled && write_combine) prot |= _PAGE_CACHE_WC; else if (pat_enabled || boot_cpu_data.x86 > 3) -- cgit v1.2.2 From 9a08f7d3506019e3833cd4394ca0d7da0ae3689f Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 23 Oct 2009 15:20:33 -0600 Subject: x86/PCI: allow MMCONFIG above 4GB The current whitelist requires a kernel change for every machine that has MMCONFIG regions above 4GB, even if BIOS provides a correct MCFG table. This patch expands the whitelist to include machines with a rev 1 or newer MCFG table and a DMI_BIOS_DATE of 2010 or later. That way, we only need kernel changes for new machines that provide incorrect MCFG tables.
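For illustration, the acceptance rule condensed into one hypothetical helper (the real check, acpi_mcfg_check_entry(), appears in the diff below):

        static int mcfg_entry_ok(struct acpi_table_mcfg *mcfg,
                                 struct acpi_mcfg_allocation *cfg)
        {
                int year;

                if (cfg->address < 0xFFFFFFFF)  /* below 4GB: always accepted */
                        return 1;
                if (!strcmp(mcfg->header.oem_id, "SGI"))        /* known-good OEM */
                        return 1;
                /* a rev 1+ MCFG table on a 2010-or-later BIOS is trusted too */
                return mcfg->header.revision >= 1 &&
                       dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
                       year >= 2010;
        }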
Signed-off-by: Bjorn Helgaas CC: Matthew Wilcox CC: John Keller CC: Yinghai Lu CC: Kenji Kaneshige CC: Andi Kleen Acked-by: Ingo Molnar Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 602c172d3bd5..02642773c29d 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -527,18 +528,31 @@ reject: static int __initdata known_bridge; -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) +static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, + struct acpi_mcfg_allocation *cfg) { + int year; + + if (cfg->address < 0xFFFFFFFF) + return 0; + if (!strcmp(mcfg->header.oem_id, "SGI")) - acpi_mcfg_64bit_base_addr = TRUE; + return 0; - return 0; + if (mcfg->header.revision >= 1) { + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && + year >= 2010) + return 0; + } + + printk(KERN_ERR PREFIX "MCFG region for %04x:%02x-%02x at %#llx " + "is above 4GB, ignored\n", cfg->pci_segment, + cfg->start_bus_number, cfg->end_bus_number, cfg->address); + return -EINVAL; } static int __init pci_parse_mcfg(struct acpi_table_header *header) @@ -574,13 +588,8 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) memcpy(pci_mmcfg_config, &mcfg[1], config_size); - acpi_mcfg_oem_check(mcfg); - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && - !acpi_mcfg_64bit_base_addr) { - printk(KERN_ERR PREFIX - "MMCONFIG not in low 4GB of memory\n"); + if (acpi_mcfg_check_entry(mcfg, &pci_mmcfg_config[i])) { kfree(pci_mmcfg_config); pci_mmcfg_config_num = 0; return -ENODEV; -- cgit v1.2.2 From 1ccbf5344c3daef046d2323190cc6807c44f1917 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 6 Oct 2009 15:11:14 -0700 Subject: xen: move Xen-testing predicates to common header Move xen_domain and related tests out of asm-x86 to xen/xen.h so they can be included whenever they are necessary. 
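For illustration, what a caller of the moved predicates looks like after this change (the demo function is hypothetical; the predicates themselves are unchanged):

        #include <xen/xen.h>    /* was <asm/xen/hypervisor.h> */

        static void demo(void)
        {
                if (!xen_domain())
                        return;         /* bare hardware */
                if (xen_pv_domain())
                        printk(KERN_INFO "running as a Xen PV guest\n");
                else if (xen_hvm_domain())
                        printk(KERN_INFO "running as a Xen HVM guest\n");
        }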
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Jesse Barnes --- arch/x86/include/asm/xen/hypervisor.h | 27 --------------------------- arch/x86/xen/enlighten.c | 1 + 2 files changed, 1 insertion(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index d5b7e90c0edf..396ff4cc8ed4 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -37,31 +37,4 @@ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -enum xen_domain_type { - XEN_NATIVE, /* running on bare hardware */ - XEN_PV_DOMAIN, /* running in a PV domain */ - XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ -}; - -#ifdef CONFIG_XEN -extern enum xen_domain_type xen_domain_type; -#else -#define xen_domain_type XEN_NATIVE -#endif - -#define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain() && \ - xen_domain_type == XEN_PV_DOMAIN) -#define xen_hvm_domain() (xen_domain() && \ - xen_domain_type == XEN_HVM_DOMAIN) - -#ifdef CONFIG_XEN_DOM0 -#include - -#define xen_initial_domain() (xen_pv_domain() && \ - xen_start_info->flags & SIF_INITDOMAIN) -#else /* !CONFIG_XEN_DOM0 */ -#define xen_initial_domain() (0) -#endif /* CONFIG_XEN_DOM0 */ - #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 23a4d80fb39e..5bccd706232c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include -- cgit v1.2.2 From af5a8ee05404112f38fb2904747c688bdc31a746 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 14 Oct 2009 10:27:42 -0600 Subject: x86/PCI: use -DDEBUG when CONFIG_PCI_DEBUG set We use dev_dbg() in arch/x86/pci, but there's no easy way to turn it on. Add -DDEBUG when CONFIG_PCI_DEBUG=y, just like we do in drivers/pci. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/Makefile | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 56d917b556c6..d8a0a6279a4d 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -16,3 +16,7 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o obj-y += amd_bus.o obj-$(CONFIG_X86_64) += intel_bus.o + +ifeq ($(CONFIG_PCI_DEBUG),y) +EXTRA_CFLAGS += -DDEBUG +endif -- cgit v1.2.2 From c7dabef8a2c59e6a3de9d66fc35fb6a43ef7172d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Oct 2009 13:26:47 -0600 Subject: vsprintf: use %pR, %pr instead of %pRt, %pRf Jesse accidentally applied v1 [1] of the patchset instead of v2 [2]. This is the diff between v1 and v2. 
The changes in this patch are: - tidied vsprintf stack buffer to shrink and compute size more accurately - use %pR for decoding and %pr for "raw" (with type and flags) instead of adding %pRt and %pRf [1] http://lkml.org/lkml/2009/10/6/491 [2] http://lkml.org/lkml/2009/10/13/441 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 7 ++++--- arch/x86/pci/i386.c | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 9b3daf976732..6bf8091d2fd5 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -108,18 +108,19 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->child = NULL; if (insert_resource(root, res)) { - dev_err(&info->bridge->dev, "can't allocate %pRt\n", res); + dev_err(&info->bridge->dev, + "can't allocate host bridge window %pR\n", res); } else { info->bus->resource[info->res_num] = res; info->res_num++; if (addr.translation_offset) - dev_info(&info->bridge->dev, "host bridge window: %pRt " + dev_info(&info->bridge->dev, "host bridge window %pR " "(PCI address [%#llx-%#llx])\n", res, res->start - addr.translation_offset, res->end - addr.translation_offset); else dev_info(&info->bridge->dev, - "host bridge window: %pRt\n", res); + "host bridge window %pR\n", res); } return AE_OK; } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 52e656f17781..d49d17de7b3f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -129,7 +129,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate %pRt\n", idx, r); + dev_info(&dev->dev, "BAR %d: can't allocate %pR\n", idx, r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -164,10 +164,11 @@ static void __init pcibios_allocate_resources(int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - dev_dbg(&dev->dev, "%pRf (d=%d, p=%d)\n", r, - disabled, pass); + dev_dbg(&dev->dev, + "BAR %d: claiming %pr (d=%d, p=%d)\n", + idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate %pRt\n", idx, r); + dev_info(&dev->dev, "BAR %d: can't claim %pR\n", idx, r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; @@ -182,7 +180,7 @@ static void __init pcibios_allocate_resources(int pass) /* Turn the ROM off, leave the resource region, * but keep it unregistered. */ u32 reg; - dev_dbg(&dev->dev, "disabling ROM %pRt\n", r); + dev_dbg(&dev->dev, "disabling ROM %pR\n", r); r->flags &= ~IORESOURCE_ROM_ENABLE; pci_read_config_dword(dev, dev->rom_base_reg, &reg); -- cgit v1.2.2 From 2a6bed8301f8b019717504575a3f9c6cce1fe271 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 4 Nov 2009 10:32:47 -0700 Subject: x86/PCI: print domain:bus in conventional format Use the dev_printk-like "%04x:%02x" format for printing PCI bus numbers.
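For illustration, the resulting conventions side by side (hypothetical calls: %pR prints a decoded resource, %pr the raw form with type and flags, and "%04x:%02x" the domain:bus pair):

        static void demo_report(struct pci_dev *dev, int idx,
                                struct resource *r, int domain, int busnum)
        {
                dev_info(&dev->dev, "BAR %d: can't allocate %pR\n", idx, r);
                dev_dbg(&dev->dev, "BAR %d: claiming %pr\n", idx, r);
                printk(KERN_WARNING "pci_bus %04x:%02x: ignored\n",
                       domain, busnum);
        }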
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 6bf8091d2fd5..68b89dc7d761 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -172,8 +172,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do #endif if (domain && !pci_domains_supported) { - printk(KERN_WARNING "PCI: Multiple domains not supported " - "(dom %d, bus %d)\n", domain, busnum); + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (multiple domains not supported)\n", + domain, busnum); return NULL; } @@ -197,7 +198,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do */ sd = kzalloc(sizeof(*sd), GFP_KERNEL); if (!sd) { - printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (out of memory)\n", domain, busnum); return NULL; } -- cgit v1.2.2 From 865df576e8fc70daf297b53e61a4fbefc719d065 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 4 Nov 2009 10:32:57 -0700 Subject: PCI: improve discovery/configuration messages This makes PCI resource management messages more consistent and adds a few new messages to aid debugging. Whenever we assign resources to a device, update a BAR, or change a bridge aperture, it's worth noting it. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index d49d17de7b3f..b73c09f45210 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -129,7 +129,9 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate %pR\n", idx, r); + dev_info(&dev->dev, + "can't reserve window %pR\n", + r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -165,10 +167,11 @@ static void __init pcibios_allocate_resources(int pass) disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { dev_dbg(&dev->dev, - "BAR %d: claiming %pr (d=%d, p=%d)\n", + "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't claim %pR\n", idx, r); + dev_info(&dev->dev, + "can't reserve %pR\n", r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; -- cgit v1.2.2 From f1db6fde09e201218f488d7205a7cd7bc448d496 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 4 Nov 2009 10:39:13 -0700 Subject: x86/PCI: for debuggability, show host bridge windows even when ignoring _CRS We have occasional problems with PCI resource allocation, and sometimes they could be avoided by paying attention to what ACPI tells us about the host bridges. This patch doesn't change the behavior, but it prints window information that should make debugging easier. 
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 68b89dc7d761..54db5a04b5e1 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -92,11 +92,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data) start = addr.minimum + addr.translation_offset; end = start + addr.address_length - 1; if (info->res_num >= max_root_bus_resources) { - printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " - "from %s for %s due to _CRS returning more than " - "%d resource descriptors\n", (unsigned long) start, - (unsigned long) end, root->name, info->name, - max_root_bus_resources); + if (pci_probe & PCI_USE__CRS) + printk(KERN_WARNING "PCI: Failed to allocate " + "0x%lx-0x%lx from %s for %s due to _CRS " + "returning more than %d resource descriptors\n", + (unsigned long) start, (unsigned long) end, + root->name, info->name, max_root_bus_resources); return AE_OK; } @@ -107,6 +108,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->end = end; res->child = NULL; + if (!(pci_probe & PCI_USE__CRS)) { + dev_printk(KERN_DEBUG, &info->bridge->dev, + "host bridge window %pR (ignored)\n", res); + return AE_OK; + } + if (insert_resource(root, res)) { dev_err(&info->bridge->dev, "can't allocate host bridge window %pR\n", res); @@ -132,6 +139,11 @@ get_current_resources(struct acpi_device *device, int busnum, struct pci_root_info info; size_t size; + if (!(pci_probe & PCI_USE__CRS)) + dev_info(&device->dev, + "ignoring host bridge windows from ACPI; " + "boot with \"pci=use_crs\" to use them\n"); + info.bridge = device; info.bus = bus; info.res_num = 0; @@ -220,9 +232,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do } else { bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); if (bus) { - if (pci_probe & PCI_USE__CRS) - get_current_resources(device, busnum, domain, - bus); + get_current_resources(device, busnum, domain, bus); bus->subordinate = pci_scan_child_bus(bus); } } -- cgit v1.2.2 From 03db42adfeeabe856dbb6894dd3aaff55838330a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 4 Nov 2009 10:39:18 -0700 Subject: x86/PCI: fix bogus host bridge window start/end alignment from _CRS PCI device BARs are guaranteed to start and end on at least a four-byte (I/O) or a sixteen-byte (MMIO) boundary because they're aligned on their size and the low BAR bits are reserved. PCI-to-PCI bridge apertures have even larger alignment restrictions. However, some BIOSes (e.g., HP DL360 BIOS P31) report host bridge windows like "[io 0x0000-0x2cfe]". This is wrong because it excludes the last port at 0x2cff: it's impossible for a downstream device to claim 0x2cfe without also claiming 0x2cff. In fact, this BIOS configures a device behind the bridge to "[io 0x2c00-0x2cff]", so we know the window actually does include 0x2cff. This patch rounds the start and end of apertures to the appropriate boundary. 
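For illustration, the rounding worked through on the HP DL360 window quoted above (the function name is hypothetical; the body mirrors the align_resource() hunk below):

        static void demo_align_io_window(struct resource *res)
        {
                int align = 4;  /* I/O decode granularity; 16 for MMIO */

                /* [io 0x0000-0x2cfe]: start 0x0000 is already aligned */
                if (res->start & (align - 1))
                        res->start &= ~(align - 1);

                /* end 0x2cfe: end + 1 = 0x2cff is not 4-byte aligned, so
                 * roundup(0x2cfe, 4) - 1 = 0x2cff, recovering the last port
                 * the downstream decoder actually claims */
                if ((res->end + 1) & (align - 1))
                        res->end = roundup(res->end, align) - 1;
        }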
I experimentally determined that Windows contains a similar workaround; details here: http://bugzilla.kernel.org/show_bug.cgi?id=14337 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 54db5a04b5e1..8ddf4f4c7253 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -59,6 +59,30 @@ bus_has_transparent_bridge(struct pci_bus *bus) return false; } +static void +align_resource(struct acpi_device *bridge, struct resource *res) +{ + int align = (res->flags & IORESOURCE_MEM) ? 16 : 4; + + /* + * Host bridge windows are not BARs, but the decoders on the PCI side + * that claim this address space have starting alignment and length + * constraints, so fix any obvious BIOS goofs. + */ + if (res->start & (align - 1)) { + dev_printk(KERN_DEBUG, &bridge->dev, + "host bridge window %pR invalid; " + "aligning start to %d-byte boundary\n", res, align); + res->start &= ~(align - 1); + } + if ((res->end + 1) & (align - 1)) { + dev_printk(KERN_DEBUG, &bridge->dev, + "host bridge window %pR invalid; " + "aligning end to %d-byte boundary\n", res, align); + res->end = roundup(res->end, align) - 1; + } +} + static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) { @@ -107,6 +131,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->start = start; res->end = end; res->child = NULL; + align_resource(info->bridge, res); if (!(pci_probe & PCI_USE__CRS)) { dev_printk(KERN_DEBUG, &info->bridge->dev, "host bridge window %pR (ignored)\n", res); return AE_OK; } -- cgit v1.2.2 From 2da3e160cb3d226d87b907fab26850d838ed8d7c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Nov 2009 23:06:50 +0100 Subject: hw-breakpoint: Move asm-generic/hw_breakpoint.h to linux/hw_breakpoint.h We plan to make the breakpoint parameters generic across architectures. For that, it's better to move the asm-generic header to a generic linux header. Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/hw_breakpoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 1acb4d45de70..3cfca8e2b5f6 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -12,7 +12,7 @@ struct arch_hw_breakpoint { }; #include -#include +#include /* Available HW breakpoint length encodings */ #define HW_BREAKPOINT_LEN_1 0x40 -- cgit v1.2.2 From c3359fbce4b65d542d02c30aa5174c8e4838da2d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Apr 2009 00:59:52 -0700 Subject: sysctl: x86 Use the compat_sys_sysctl Now that we have a generic 32bit compatibility implementation, there is no need for x86 to implement its own. Cc: Thomas Gleixner Cc: Ingo Molnar Acked-by: H. Peter Anvin Signed-off-by: Eric W.
Biederman --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/ia32/sys_ia32.c | 56 ----------------------------------------- arch/x86/include/asm/sys_ia32.h | 5 ---- 3 files changed, 1 insertion(+), 62 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 581b0568fe19..5d2584839be4 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -653,7 +653,7 @@ ia32_sys_call_table: .quad compat_sys_writev .quad sys_getsid .quad sys_fdatasync - .quad sys32_sysctl /* sysctl */ + .quad compat_sys_sysctl /* sysctl */ .quad sys_mlock /* 150 */ .quad sys_munlock .quad sys_mlockall diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 9f5527198825..df82c0e48ded 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -434,62 +434,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig, return ret; } -#ifdef CONFIG_SYSCTL_SYSCALL -struct sysctl_ia32 { - unsigned int name; - int nlen; - unsigned int oldval; - unsigned int oldlenp; - unsigned int newval; - unsigned int newlen; - unsigned int __unused[4]; -}; - - -asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32) -{ - struct sysctl_ia32 a32; - mm_segment_t old_fs = get_fs(); - void __user *oldvalp, *newvalp; - size_t oldlen; - int __user *namep; - long ret; - - if (copy_from_user(&a32, args32, sizeof(a32))) - return -EFAULT; - - /* - * We need to pre-validate these because we have to disable - * address checking before calling do_sysctl() because of - * OLDLEN but we can't run the risk of the user specifying bad - * addresses here. Well, since we're dealing with 32 bit - * addresses, we KNOW that access_ok() will always succeed, so - * this is an expensive NOP, but so what... - */ - namep = compat_ptr(a32.name); - oldvalp = compat_ptr(a32.oldval); - newvalp = compat_ptr(a32.newval); - - if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp))) - || !access_ok(VERIFY_WRITE, namep, 0) - || !access_ok(VERIFY_WRITE, oldvalp, 0) - || !access_ok(VERIFY_WRITE, newvalp, 0)) - return -EFAULT; - - set_fs(KERNEL_DS); - lock_kernel(); - ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen, - newvalp, (size_t) a32.newlen); - unlock_kernel(); - set_fs(old_fs); - - if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp))) - return -EFAULT; - - return ret; -} -#endif - /* warning: next two assume little endian */ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 72a6dcd1299b..9af9decb38c3 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -51,11 +51,6 @@ asmlinkage long sys32_sched_rr_get_interval(compat_pid_t, asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); -#ifdef CONFIG_SYSCTL_SYSCALL -struct sysctl_ia32; -asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *); -#endif - asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); -- cgit v1.2.2 From ea7f1b6ee9dc96c5827b06ba21d7769d553efb7d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 5 Nov 2009 11:17:11 -0600 Subject: x86/PCI: remove 64-bit division The roundup() caused a build error (undefined reference to `__udivdi3'). 
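To see why, compare the two generic forms (sketched here from the kernel headers of the era; ROUNDUP and ALIGN_P2 are illustrative stand-ins for the real roundup() and ALIGN() macros):

        /* roundup() is fully generic and divides, so a 64-bit resource
         * value pulls in the libgcc helper __udivdi3 on 32-bit builds.
         * ALIGN() assumes a power-of-two boundary and only masks, so no
         * division is emitted. */
        #define ROUNDUP(x, y)   ((((x) + ((y) - 1)) / (y)) * (y))
        #define ALIGN_P2(x, a)  (((x) + ((a) - 1)) & ~((a) - 1))

For a power-of-two boundary the two agree, e.g. ALIGN_P2(0x2cff, 4) == ROUNDUP(0x2cff, 4) == 0x2d00, so the substitution is behavior-preserving.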
We're aligning to power-of-two boundaries, so it's simpler to just use ALIGN() anyway, which avoids the division. Signed-off-by: Bjorn Helgaas Acked-by: Randy Dunlap Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 8ddf4f4c7253..959e548a7039 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -69,17 +69,17 @@ align_resource(struct acpi_device *bridge, struct resource *res) * that claim this address space have starting alignment and length * constraints, so fix any obvious BIOS goofs. */ - if (res->start & (align - 1)) { + if (!IS_ALIGNED(res->start, align)) { dev_printk(KERN_DEBUG, &bridge->dev, "host bridge window %pR invalid; " "aligning start to %d-byte boundary\n", res, align); res->start &= ~(align - 1); } - if ((res->end + 1) & (align - 1)) { + if (!IS_ALIGNED(res->end + 1, align)) { dev_printk(KERN_DEBUG, &bridge->dev, "host bridge window %pR invalid; " "aligning end to %d-byte boundary\n", res, align); - res->end = roundup(res->end, align) - 1; + res->end = ALIGN(res->end, align) - 1; } } -- cgit v1.2.2 From c12a229bc5971534537a7d0e49e44f9f1f5d0336 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 5 Nov 2009 11:03:59 -0500 Subject: x86: Remove unused thread_return label from switch_to() Remove unused thread_return label from switch_to() macro on x86-64. Since this symbol cuts into schedule(), backtrace at the latter half of schedule() was always shown as thread_return(). Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE LKML-Reference: <20091105160359.5181.26225.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/system.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index f08f97374892..1a953e26401c 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -128,8 +128,6 @@ do { \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ "call __switch_to\n\t" \ - ".globl thread_return\n" \ - "thread_return:\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ -- cgit v1.2.2 From 0d0fbbddcc27c062815732b38c44b544e656c799 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 5 Nov 2009 22:45:41 +1030 Subject: x86, msr, cpumask: Use struct cpumask rather than the deprecated cpumask_t This makes the declarations match the definitions, which already use 'struct cpumask'. 
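A side benefit of the struct tag (a general C point, not something stated in this patch): a header can carry such prototypes with only a forward declaration in scope, whereas the cpumask_t typedef is only visible once the full cpumask header has been included.

        struct cpumask;         /* forward declaration is sufficient */
        /* sketch only, not the real declaration from msr.h */
        void rdmsr_on_cpus_decl(const struct cpumask *m, unsigned int msr_no);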
Signed-off-by: Rusty Russell Acked-by: Borislav Petkov LKML-Reference: <200911052245.41803.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 9a00219b331a..5bef931f8b14 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -264,12 +264,12 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) wrmsr(msr_no, l, h); return 0; } -static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no, +static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); } -static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no, +static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); -- cgit v1.2.2 From 0420101c075530c65ba00b6fe7291b126fbfc5d2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 Oct 2009 16:09:55 -0700 Subject: x86: k8.h: Add struct bootnode k8.h uses struct bootnode but does not #include a header file for it, so provide a simple declaration for it. arch/x86/include/asm/k8.h:13: warning: 'struct bootnode' declared inside parameter list arch/x86/include/asm/k8.h:13: warning: its scope is only this definition or declaration, which is probably not what you want Signed-off-by: Randy Dunlap Acked-by: David Rientjes Cc: Stephen Rothwell LKML-Reference: <20091028160955.d27ccb16.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/k8.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index c092f720bd60..f70e60071fe8 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -4,6 +4,7 @@ #include extern struct pci_device_id k8_nb_ids[]; +struct bootnode; extern int early_is_k8_nb(u32 value); extern struct pci_dev **k8_northbridges; -- cgit v1.2.2 From 338bac527ed0e35b4cb50390972f15d3cbce92ca Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 27 Oct 2009 16:34:44 +0900 Subject: x86: Use x86_platform for iommu_shutdown This patch cleans up pci_iommu_shutdown() a bit to use x86_platform (similar to how IA64 initializes an IOMMU driver). This adds iommu_shutdown() to x86_platform to avoid calling every IOMMUs' shutdown functions in pci_iommu_shutdown() in order. The IOMMU shutdown functions are platform specific (we don't have multiple different IOMMU hardware) so the current way is pointless. An IOMMU driver sets x86_platform.iommu_shutdown to the shutdown function if necessary. 
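The shape of the hook, sketched with hypothetical _sketch names (the real fields and functions appear in the diff below):

        struct platform_ops_sketch {
                void (*iommu_shutdown)(void);
        };

        static void iommu_shutdown_noop_sketch(void) { }   /* safe default */

        static struct platform_ops_sketch platform_sketch = {
                .iommu_shutdown = iommu_shutdown_noop_sketch,
        };

        static void gart_shutdown_sketch(void)
        {
                /* quiesce the hardware this driver initialized */
        }

        static void gart_init_sketch(void)
        {
                /* only the driver that took ownership installs its hook */
                platform_sketch.iommu_shutdown = gart_shutdown_sketch;
        }

        static void machine_shutdown_sketch(void)
        {
                /* callers no longer need per-IOMMU knowledge or #ifdefs */
                platform_sketch.iommu_shutdown();
        }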
Signed-off-by: FUJITA Tomonori Cc: joerg.roedel@amd.com LKML-Reference: <20091027163358F.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/amd_iommu.h | 2 -- arch/x86/include/asm/gart.h | 4 ---- arch/x86/include/asm/iommu.h | 2 +- arch/x86/include/asm/x86_init.h | 1 + arch/x86/kernel/amd_iommu_init.c | 6 +----- arch/x86/kernel/crash.c | 5 ++--- arch/x86/kernel/pci-dma.c | 7 ------- arch/x86/kernel/pci-gart_64.c | 6 ++++-- arch/x86/kernel/reboot.c | 4 ++-- arch/x86/kernel/x86_init.c | 2 ++ 10 files changed, 13 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 4b180897e6b5..3604669f7b15 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -30,12 +30,10 @@ extern void amd_iommu_detect(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_flush_all_domains(void); extern void amd_iommu_flush_all_devices(void); -extern void amd_iommu_shutdown(void); extern void amd_iommu_apply_erratum_63(u16 devid); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } -static inline void amd_iommu_shutdown(void) { } #endif #endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 6cfdafa409d8..4fdd5b3f87b1 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -36,7 +36,6 @@ extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); extern void gart_iommu_init(void); -extern void gart_iommu_shutdown(void); extern void __init gart_parse_options(char *); extern void gart_iommu_hole_init(void); @@ -51,9 +50,6 @@ static inline void early_gart_iommu_check(void) static inline void gart_iommu_init(void) { } -static inline void gart_iommu_shutdown(void) -{ -} static inline void gart_parse_options(char *options) { } diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index fd6d21bbee6c..878b30715766 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H -extern void pci_iommu_shutdown(void); +static inline void iommu_shutdown_noop(void) {} extern void no_iommu_init(void); extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2c756fd4ab0e..66008ed80b7a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -121,6 +121,7 @@ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); + void (*iommu_shutdown)(void); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c20001e4f556..6acd43e9afd7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1297,6 +1297,7 @@ int __init amd_iommu_init(void) else printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + x86_platform.iommu_shutdown = disable_iommus; out: return ret; @@ -1323,11 +1324,6 @@ free: goto out; } -void amd_iommu_shutdown(void) -{ - disable_iommus(); -} - /**************************************************************************** * * Early detect code. 
This code runs at IOMMU detection time in the DMA diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 5e409dc298a4..a4849c10a77e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,8 +27,7 @@ #include #include #include -#include - +#include #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif crash_save_cpu(regs, safe_smp_processor_id()); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b2a71dca5642..ce2fb91bbed1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -303,13 +303,6 @@ static int __init pci_iommu_init(void) no_iommu_init(); return 0; } - -void pci_iommu_shutdown(void) -{ - gart_iommu_shutdown(); - - amd_iommu_shutdown(); -} /* Must execute after PCI subsystem */ rootfs_initcall(pci_iommu_init); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a7f1b64f86e0..a9bcdf7c8801 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -39,6 +39,7 @@ #include #include #include +#include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -688,12 +689,12 @@ static struct dma_map_ops gart_dma_ops = { .free_coherent = gart_free_coherent, }; -void gart_iommu_shutdown(void) +static void gart_iommu_shutdown(void) { struct pci_dev *dev; int i; - if (no_agp && (dma_ops != &gart_dma_ops)) + if (no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { @@ -838,6 +839,7 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; } void __init gart_parse_options(char *p) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f93078746e00..2b97fc5b124e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -23,7 +23,7 @@ # include # include #else -# include +# include #endif /* @@ -622,7 +622,7 @@ void native_machine_shutdown(void) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4449a4a2c2ed..bc9b230ef402 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,6 +14,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -72,4 +73,5 @@ struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, + .iommu_shutdown = iommu_shutdown_noop, }; -- cgit v1.2.2 From 2ae8bb75db1f3de422eb5898f2a063c46c36dba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Oct 2009 15:41:46 +0100 Subject: x86: Fix iommu=nodac parameter handling iommu=nodac should forbid dac instead of enabling it. Fix it. 
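For context, a sketch of the fixed parser with the tri-state spelled out; the meaning of the sentinel values is our reading of the surrounding code, not stated in the patch:

        #include <string.h>

        static int forbid_dac;  /* <0: user forced DAC on ("usedac"),
                                 *  0: DAC allowed (default, "allowdac"),
                                 * >0: DAC forbidden ("nodac") */

        static void iommu_setup_sketch(const char *p)
        {
                if (!strncmp(p, "allowdac", 8))
                        forbid_dac = 0;
                if (!strncmp(p, "nodac", 5))
                        forbid_dac = 1; /* the bug: was -1, i.e. "usedac" */
                if (!strncmp(p, "usedac", 6))
                        forbid_dac = -1;
        }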
Signed-off-by: Tejun Heo Acked-by: FUJITA Tomonori Cc: Matteo Frigo Cc: # .32.x and older LKML-Reference: <4AE5B52A.4050408@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index ce2fb91bbed1..839d49a669bc 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -216,7 +216,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; -- cgit v1.2.2 From 24f1e32c60c45c89a997c73395b69c8af6f0a84e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Sep 2009 19:22:48 +0200 Subject: hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events This patch rebase the implementation of the breakpoints API on top of perf events instances. Each breakpoints are now perf events that handle the register scheduling, thread/cpu attachment, etc.. The new layering is now made as follows: ptrace kgdb ftrace perf syscall \ | / / \ | / / / Core breakpoint API / / | / | / Breakpoints perf events | | Breakpoints PMU ---- Debug Register constraints handling (Part of core breakpoint API) | | Hardware debug registers Reasons of this rewrite: - Use the centralized/optimized pmu registers scheduling, implying an easier arch integration - More powerful register handling: perf attributes (pinned/flexible events, exclusive/non-exclusive, tunable period, etc...) Impact: - New perf ABI: the hardware breakpoints counters - Ptrace breakpoints setting remains tricky and still needs some per thread breakpoints references. Todo (in the order): - Support breakpoints perf counter events for perf tools (ie: implement perf_bpcounter_event()) - Support from perf tools Changes in v2: - Follow the perf "event " rename - The ptrace regression have been fixed (ptrace breakpoint perf events weren't released when a task ended) - Drop the struct hw_breakpoint and store generic fields in perf_event_attr. - Separate core and arch specific headers, drop asm-generic/hw_breakpoint.h and create linux/hw_breakpoint.h - Use new generic len/type for breakpoint - Handle off case: when breakpoints api is not supported by an arch Changes in v3: - Fix broken CONFIG_KVM, we need to propagate the breakpoint api changes to kvm when we exit the guest and restore the bp registers to the host. Changes in v4: - Drop the hw_breakpoint_restore() stub as it is only used by KVM - EXPORT_SYMBOL_GPL hw_breakpoint_restore() as KVM can be built as a module - Restore the breakpoints unconditionally on kvm guest exit: TIF_DEBUG_THREAD doesn't anymore cover every cases of running breakpoints and vcpu->arch.switch_db_regs might not always be set when the guest used debug registers. (Waiting for a reliable optimization) Changes in v5: - Split-up the asm-generic/hw-breakpoint.h moving to linux/hw_breakpoint.h into a separate patch - Optimize the breakpoints restoring while switching from kvm guest to host. We only want to restore the state if we have active breakpoints to the host, otherwise we don't care about messed-up address registers. 
- Add asm/hw_breakpoint.h to Kbuild - Fix bad breakpoint type in trace_selftest.c Changes in v6: - Fix wrong header inclusion in trace.h (triggered a build error with CONFIG_FTRACE_SELFTEST Signed-off-by: Frederic Weisbecker Cc: Prasad Cc: Alan Stern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Ingo Molnar Cc: Jan Kiszka Cc: Jiri Slaby Cc: Li Zefan Cc: Avi Kivity Cc: Paul Mackerras Cc: Mike Galbraith Cc: Masami Hiramatsu Cc: Paul Mundt --- arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/debugreg.h | 11 +- arch/x86/include/asm/hw_breakpoint.h | 58 ++++-- arch/x86/include/asm/processor.h | 12 +- arch/x86/kernel/hw_breakpoint.c | 391 +++++++++++++++++++++++------------ arch/x86/kernel/process.c | 7 +- arch/x86/kernel/process_32.c | 26 +-- arch/x86/kernel/process_64.c | 26 +-- arch/x86/kernel/ptrace.c | 182 +++++++++++----- arch/x86/kernel/smpboot.c | 3 - arch/x86/kvm/x86.c | 18 +- arch/x86/power/cpu.c | 6 - 12 files changed, 445 insertions(+), 296 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4a8e80cdcfa5..9f828f87ca35 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,6 +10,7 @@ header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h +header-y += hw_breakpoint.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 23439fbb1d0e..9a3333c91f9a 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -75,13 +75,8 @@ */ #ifdef __KERNEL__ -/* For process management */ -extern void flush_thread_hw_breakpoint(struct task_struct *tsk); -extern int copy_thread_hw_breakpoint(struct task_struct *tsk, - struct task_struct *child, unsigned long clone_flags); +DECLARE_PER_CPU(unsigned long, dr7); -/* For CPU management */ -extern void load_debug_registers(void); static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ @@ -94,6 +89,10 @@ static inline void hw_breakpoint_disable(void) set_debugreg(0UL, 3); } +#ifdef CONFIG_KVM +extern void hw_breakpoint_restore(void); +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 3cfca8e2b5f6..0675a7c4c20e 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -4,6 +4,11 @@ #ifdef __KERNEL__ #define __ARCH_HW_BREAKPOINT_H +/* + * The name should probably be something dealt in + * a higher level. 
While dealing with the user + * (display/resolving) + */ struct arch_hw_breakpoint { char *name; /* Contains name of the symbol to set bkpt */ unsigned long address; @@ -12,44 +17,57 @@ struct arch_hw_breakpoint { }; #include -#include +#include +#include /* Available HW breakpoint length encodings */ -#define HW_BREAKPOINT_LEN_1 0x40 -#define HW_BREAKPOINT_LEN_2 0x44 -#define HW_BREAKPOINT_LEN_4 0x4c -#define HW_BREAKPOINT_LEN_EXECUTE 0x40 +#define X86_BREAKPOINT_LEN_1 0x40 +#define X86_BREAKPOINT_LEN_2 0x44 +#define X86_BREAKPOINT_LEN_4 0x4c +#define X86_BREAKPOINT_LEN_EXECUTE 0x40 #ifdef CONFIG_X86_64 -#define HW_BREAKPOINT_LEN_8 0x48 +#define X86_BREAKPOINT_LEN_8 0x48 #endif /* Available HW breakpoint type encodings */ /* trigger on instruction execute */ -#define HW_BREAKPOINT_EXECUTE 0x80 +#define X86_BREAKPOINT_EXECUTE 0x80 /* trigger on memory write */ -#define HW_BREAKPOINT_WRITE 0x81 +#define X86_BREAKPOINT_WRITE 0x81 /* trigger on memory read or write */ -#define HW_BREAKPOINT_RW 0x83 +#define X86_BREAKPOINT_RW 0x83 /* Total number of available HW breakpoint registers */ #define HBP_NUM 4 -extern struct hw_breakpoint *hbp_kernel[HBP_NUM]; -DECLARE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]); -extern unsigned int hbp_user_refcount[HBP_NUM]; +struct perf_event; +struct pmu; -extern void arch_install_thread_hw_breakpoint(struct task_struct *tsk); -extern void arch_uninstall_thread_hw_breakpoint(void); extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); -extern int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, - struct task_struct *tsk); -extern void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk); -extern void arch_flush_thread_hw_breakpoint(struct task_struct *tsk); -extern void arch_update_kernel_hw_breakpoint(void *); +extern int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, - unsigned long val, void *data); + unsigned long val, void *data); + + +int arch_install_hw_breakpoint(struct perf_event *bp); +void arch_uninstall_hw_breakpoint(struct perf_event *bp); +void hw_breakpoint_pmu_read(struct perf_event *bp); +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp); + +extern void +arch_fill_perf_breakpoint(struct perf_event *bp); + +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type); +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type); + +extern int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type); + +extern struct pmu perf_ops_bp; + #endif /* __KERNEL__ */ #endif /* _I386_HW_BREAKPOINT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 61aafb71c7ef..820f3000f736 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -423,6 +423,8 @@ extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; +struct perf_event; + struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -444,12 +446,10 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; - /* Hardware debugging registers: */ - unsigned long debugreg[HBP_NUM]; - unsigned long debugreg6; - unsigned long debugreg7; - /* Hardware breakpoint info */ - struct hw_breakpoint *hbp[HBP_NUM]; + /* Save middle states of ptrace breakpoints */ + struct perf_event *ptrace_bps[HBP_NUM]; + 
/* Debug status used for traps, single steps, etc... */ + unsigned long debugreg6; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 9316a9de4de3..e622620790bd 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ * * Copyright (C) 2007 Alan Stern * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker */ /* @@ -22,6 +23,8 @@ * using the CPU's debug registers. */ +#include +#include #include #include #include @@ -38,26 +41,24 @@ #include #include -/* Unmasked kernel DR7 value */ -static unsigned long kdr7; +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); /* - * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register. - * Used to clear and verify the status of bits corresponding to DR0 - DR3 + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus */ -static const unsigned long dr7_masks[HBP_NUM] = { - 0x000f0003, /* LEN0, R/W0, G0, L0 */ - 0x00f0000c, /* LEN1, R/W1, G1, L1 */ - 0x0f000030, /* LEN2, R/W2, G2, L2 */ - 0xf00000c0 /* LEN3, R/W3, G3, L3 */ -}; +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); /* * Encode the length, type, Exact, and Enable bits for a particular breakpoint * as stored in debug register 7. */ -static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) { unsigned long bp_info; @@ -68,64 +69,89 @@ static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) return bp_info; } -void arch_update_kernel_hw_breakpoint(void *unused) +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) { - struct hw_breakpoint *bp; - int i, cpu = get_cpu(); - unsigned long temp_kdr7 = 0; - - /* Don't allow debug exceptions while we update the registers */ - set_debugreg(0UL, 7); + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); - for (i = hbp_kernel_pos; i < HBP_NUM; i++) { - per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i]; - if (bp) { - temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type); - set_debugreg(bp->info.address, i); - } - } + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; - /* No need to set DR6. Update the debug registers with kernel-space - * breakpoint values from kdr7 and user-space requests from the - * current process - */ - kdr7 = temp_kdr7; - set_debugreg(kdr7 | current->thread.debugreg7, 7); - put_cpu(); + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } /* - * Install the thread breakpoints in their debug registers. + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. 
*/ -void arch_install_thread_hw_breakpoint(struct task_struct *tsk) +int arch_install_hw_breakpoint(struct perf_event *bp) { - struct thread_struct *thread = &(tsk->thread); - - switch (hbp_kernel_pos) { - case 4: - set_debugreg(thread->debugreg[3], 3); - case 3: - set_debugreg(thread->debugreg[2], 2); - case 2: - set_debugreg(thread->debugreg[1], 1); - case 1: - set_debugreg(thread->debugreg[0], 0); - default: - break; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } } - /* No need to set DR6 */ - set_debugreg((kdr7 | thread->debugreg7), 7); + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __get_cpu_var(cpu_debugreg[i]) = info->address; + + dr7 = &__get_cpu_var(dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); + + return 0; } /* - * Install the debug register values for just the kernel, no thread. + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. */ -void arch_uninstall_thread_hw_breakpoint(void) +void arch_uninstall_hw_breakpoint(struct perf_event *bp) { - /* Clear the user-space portion of debugreg7 by setting only kdr7 */ - set_debugreg(kdr7, 7); + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + dr7 = &__get_cpu_var(dr7); + *dr7 &= ~encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); } static int get_hbp_len(u8 hbp_len) @@ -133,17 +159,17 @@ static int get_hbp_len(u8 hbp_len) unsigned int len_in_bytes = 0; switch (hbp_len) { - case HW_BREAKPOINT_LEN_1: + case X86_BREAKPOINT_LEN_1: len_in_bytes = 1; break; - case HW_BREAKPOINT_LEN_2: + case X86_BREAKPOINT_LEN_2: len_in_bytes = 2; break; - case HW_BREAKPOINT_LEN_4: + case X86_BREAKPOINT_LEN_4: len_in_bytes = 4; break; #ifdef CONFIG_X86_64 - case HW_BREAKPOINT_LEN_8: + case X86_BREAKPOINT_LEN_8: len_in_bytes = 8; break; #endif @@ -178,67 +204,146 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) /* * Store a breakpoint's encoded address, length, and type. */ -static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk) +static int arch_store_info(struct perf_event *bp) { - /* - * User-space requests will always have the address field populated - * Symbol names from user-space are rejected - */ - if (tsk && bp->info.name) - return -EINVAL; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); /* * For kernel-addresses, either the address or symbol name can be * specified. 
*/ - if (bp->info.name) - bp->info.address = (unsigned long) - kallsyms_lookup_name(bp->info.name); - if (bp->info.address) + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); + if (info->address) return 0; + return -EINVAL; } -/* - * Validate the arch-specific HW Breakpoint register settings - */ -int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, - struct task_struct *tsk) +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) { - unsigned int align; - int ret = -EINVAL; + /* Len */ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case X86_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case X86_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } - switch (bp->info.type) { - /* - * Ptrace-refactoring code - * For now, we'll allow instruction breakpoint only for user-space - * addresses - */ - case HW_BREAKPOINT_EXECUTE: - if ((!arch_check_va_in_userspace(bp->info.address, - bp->info.len)) && - bp->info.len != HW_BREAKPOINT_LEN_EXECUTE) - return ret; + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + *gen_type = HW_BREAKPOINT_X; break; - case HW_BREAKPOINT_WRITE: + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; break; - case HW_BREAKPOINT_RW: + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; break; default: - return ret; + return -EINVAL; } - switch (bp->info.len) { + return 0; +} + + +static int arch_build_bp_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + info->address = bp->attr.bp_addr; + + /* Len */ + switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: - align = 0; + info->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: - align = 1; + info->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: - align = 3; + info->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: + info->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + break; + default: + return -EINVAL; + } + + return 0; +} +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp); + if (ret) + return ret; + + ret = -EINVAL; + + if (info->type == X86_BREAKPOINT_EXECUTE) + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + if ((!arch_check_va_in_userspace(info->address, info->len)) && + info->len != X86_BREAKPOINT_EXECUTE) + return ret; + + switch (info->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: align = 7; break; #endif @@ -246,8 +351,8 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, return ret; } - if (bp->triggered) - ret = arch_store_info(bp, tsk); 
+ if (bp->callback) + ret = arch_store_info(bp); if (ret < 0) return ret; @@ -255,44 +360,47 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ - if (bp->info.address & align) + if (info->address & align) return -EINVAL; /* Check that the virtual address is in the proper range */ if (tsk) { - if (!arch_check_va_in_userspace(bp->info.address, bp->info.len)) + if (!arch_check_va_in_userspace(info->address, info->len)) return -EFAULT; } else { - if (!arch_check_va_in_kernelspace(bp->info.address, - bp->info.len)) + if (!arch_check_va_in_kernelspace(info->address, info->len)) return -EFAULT; } + return 0; } -void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk) +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { - struct thread_struct *thread = &(tsk->thread); - struct hw_breakpoint *bp = thread->hbp[pos]; - - thread->debugreg7 &= ~dr7_masks[pos]; - if (bp) { - thread->debugreg[pos] = bp->info.address; - thread->debugreg7 |= encode_dr7(pos, bp->info.len, - bp->info.type); - } else - thread->debugreg[pos] = 0; + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } } -void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) +#ifdef CONFIG_KVM +void hw_breakpoint_restore(void) { - int i; - struct thread_struct *thread = &(tsk->thread); - - thread->debugreg7 = 0; - for (i = 0; i < HBP_NUM; i++) - thread->debugreg[i] = 0; + set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); + set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); + set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); + set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(__get_cpu_var(dr7), 7); } +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); +#endif /* * Handle debug exception notifications. @@ -313,7 +421,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) static int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; - struct hw_breakpoint *bp; + struct perf_event *bp; unsigned long dr7, dr6; unsigned long *dr6_p; @@ -325,10 +433,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - /* Lazy debug register switching */ - if (!test_tsk_thread_flag(current, TIF_DEBUG)) - arch_uninstall_thread_hw_breakpoint(); - get_debugreg(dr7, 7); /* Disable breakpoints during exception handling */ set_debugreg(0UL, 7); @@ -344,17 +448,18 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) for (i = 0; i < HBP_NUM; ++i) { if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; + /* - * Find the corresponding hw_breakpoint structure and - * invoke its triggered callback. + * The counter may be concurrently released but that can only + * occur from a call_rcu() path. We can then safely fetch + * the breakpoint, use its callback, touch its counter + * while we are in an rcu_read_lock() path. 
*/ - if (i >= hbp_kernel_pos) - bp = per_cpu(this_hbp_kernel[i], cpu); - else { - bp = current->thread.hbp[i]; - if (bp) - rc = NOTIFY_DONE; - } + rcu_read_lock(); + + bp = per_cpu(bp_per_reg[i], cpu); + if (bp) + rc = NOTIFY_DONE; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -362,19 +467,23 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) (*dr6_p) &= ~(DR_TRAP0 << i); /* * bp can be NULL due to lazy debug register switching - * or due to the delay between updates of hbp_kernel_pos - * and this_hbp_kernel. + * or due to concurrent perf counter removing. */ - if (!bp) - continue; + if (!bp) { + rcu_read_unlock(); + break; + } + + (bp->callback)(bp, args->regs); - (bp->triggered)(bp, args->regs); + rcu_read_unlock(); } if (dr6 & (~DR_TRAP_BITS)) rc = NOTIFY_DONE; set_debugreg(dr7, 7); put_cpu(); + return rc; } @@ -389,3 +498,13 @@ int __kprobes hw_breakpoint_exceptions_notify( return hw_breakpoint_handler(data); } + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} + +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index cf8ee0016307..744508e7cfdd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -18,7 +19,6 @@ #include #include #include -#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -47,8 +47,6 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - flush_thread_hw_breakpoint(tsk); WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } @@ -107,8 +105,7 @@ void flush_thread(void) } #endif - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - flush_thread_hw_breakpoint(tsk); + flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 209e74801763..d5bd3132ee70 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -59,7 +59,6 @@ #include #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -264,9 +263,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - if (copy_thread_hw_breakpoint(tsk, p, clone_flags)) - goto out; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, @@ -287,13 +285,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); -out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - if (err) - flush_thread_hw_breakpoint(p); clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); p->thread.ds_ctx = NULL; @@ -437,23 +432,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) lazy_load_gs(next->gs); percpu_write(current_task, next_p); - /* - * There's a problem with moving the arch_install_thread_hw_breakpoint() - * call before current is updated. 
Suppose a kernel breakpoint is - * triggered in between the two, the hw-breakpoint handler will see that - * the 'current' task does not have TIF_DEBUG flag set and will think it - * is leftover from an old task (lazy switching) and will erase it. Then - * until the next context switch, no user-breakpoints will be installed. - * - * The real problem is that it's impossible to update both current and - * physical debug registers at the same instant, so there will always be - * a window in which they disagree and a breakpoint might get triggered. - * Since we use lazy switching, we are forced to assume that a - * disagreement means that current is correct and the exception is due - * to lazy debug register switching. - */ - if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) - arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 72edac026a78..5bafdec34441 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,7 +53,6 @@ #include #include #include -#include asmlinkage extern void ret_from_fork(void); @@ -244,8 +243,6 @@ void release_thread(struct task_struct *dead_task) BUG(); } } - if (unlikely(dead_task->thread.debugreg7)) - flush_thread_hw_breakpoint(dead_task); } static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) @@ -309,9 +306,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, savesegment(ds, p->thread.ds); err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG))) - if (copy_thread_hw_breakpoint(me, p, clone_flags)) - goto out; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); @@ -351,8 +346,6 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - if (err) - flush_thread_hw_breakpoint(p); return err; } @@ -508,23 +501,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (preload_fpu) __math_state_restore(); - /* - * There's a problem with moving the arch_install_thread_hw_breakpoint() - * call before current is updated. Suppose a kernel breakpoint is - * triggered in between the two, the hw-breakpoint handler will see that - * the 'current' task does not have TIF_DEBUG flag set and will think it - * is leftover from an old task (lazy switching) and will erase it. Then - * until the next context switch, no user-breakpoints will be installed. - * - * The real problem is that it's impossible to update both current and - * physical debug registers at the same instant, so there will always be - * a window in which they disagree and a breakpoint might get triggered. - * Since we use lazy switching, we are forced to assume that a - * disagreement means that current is correct and the exception is due - * to lazy debug register switching. - */ - if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) - arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 267cb85b479c..e79610d95971 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -441,54 +443,59 @@ static int genregs_set(struct task_struct *target, return ret; } -/* - * Decode the length and type bits for a particular breakpoint as - * stored in debug register 7. Return the "enabled" status. 
- */ -static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, - unsigned *type) -{ - int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); - - *len = (bp_info & 0xc) | 0x40; - *type = (bp_info & 0x3) | 0x80; - return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; -} - -static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) +static void ptrace_triggered(struct perf_event *bp, void *data) { - struct thread_struct *thread = &(current->thread); int i; + struct thread_struct *thread = &(current->thread); /* * Store in the virtual DR6 register the fact that the breakpoint * was hit so the thread's debugger will see it. */ - for (i = 0; i < hbp_kernel_pos; i++) - /* - * We will check bp->info.address against the address stored in - * thread's hbp structure and not debugreg[i]. This is to ensure - * that the corresponding bit for 'i' in DR7 register is enabled - */ - if (bp->info.address == thread->hbp[i]->info.address) + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) break; + } thread->debugreg6 |= (DR_TRAP0 << i); } +/* + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * + */ +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) +{ + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } + } + + return dr7; +} + /* * Handle ptrace writes to debug register 7. */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { struct thread_struct *thread = &(tsk->thread); - unsigned long old_dr7 = thread->debugreg7; + unsigned long old_dr7; int i, orig_ret = 0, rc = 0; int enabled, second_pass = 0; unsigned len, type; - struct hw_breakpoint *bp; + int gen_len, gen_type; + struct perf_event *bp; data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: /* * Loop through all the hardware breakpoints, making the @@ -496,11 +503,12 @@ restore: */ for (i = 0; i < HBP_NUM; i++) { enabled = decode_dr7(data, i, &len, &type); - bp = thread->hbp[i]; + bp = thread->ptrace_bps[i]; if (!enabled) { if (bp) { - /* Don't unregister the breakpoints right-away, + /* + * Don't unregister the breakpoints right-away, * unless all register_user_hw_breakpoint() * requests have succeeded. This prevents * any window of opportunity for debug @@ -508,27 +516,45 @@ restore: */ if (!second_pass) continue; - unregister_user_hw_breakpoint(tsk, bp); - kfree(bp); + thread->ptrace_bps[i] = NULL; + unregister_hw_breakpoint(bp); } continue; } + + /* + * We shoud have at least an inactive breakpoint at this + * slot. 
It means the user is writing dr7 without having + * written the address register first + */ if (!bp) { - rc = -ENOMEM; - bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); - if (bp) { - bp->info.address = thread->debugreg[i]; - bp->triggered = ptrace_triggered; - bp->info.len = len; - bp->info.type = type; - rc = register_user_hw_breakpoint(tsk, bp); - if (rc) - kfree(bp); - } - } else - rc = modify_user_hw_breakpoint(tsk, bp); + rc = -EINVAL; + break; + } + + rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (rc) break; + + /* + * This is a temporary thing as bp is unregistered/registered + * to simulate modification + */ + bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len, + gen_type, bp->callback, + tsk, true); + thread->ptrace_bps[i] = NULL; + + if (!bp) { /* incorrect bp, or we have a bug in bp API */ + rc = -EINVAL; + break; + } + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + bp = NULL; + break; + } + thread->ptrace_bps[i] = bp; } /* * Make a second pass to free the remaining unused breakpoints @@ -553,15 +579,63 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) struct thread_struct *thread = &(tsk->thread); unsigned long val = 0; - if (n < HBP_NUM) - val = thread->debugreg[n]; - else if (n == 6) + if (n < HBP_NUM) { + struct perf_event *bp; + bp = thread->ptrace_bps[n]; + if (!bp) + return 0; + val = bp->hw.info.address; + } else if (n == 6) { val = thread->debugreg6; - else if (n == 7) - val = thread->debugreg7; + } else if (n == 7) { + val = ptrace_get_dr7(thread->ptrace_bps); + } return val; } +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct perf_event *bp; + struct thread_struct *t = &tsk->thread; + + if (!t->ptrace_bps[nr]) { + /* + * Put stub len and type to register (reserve) an inactive but + * correct bp + */ + bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1, + HW_BREAKPOINT_W, + ptrace_triggered, tsk, + false); + } else { + bp = t->ptrace_bps[nr]; + t->ptrace_bps[nr] = NULL; + bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len, + bp->attr.bp_type, + bp->callback, + tsk, + bp->attr.disabled); + } + + if (!bp) + return -EIO; + /* + * CHECKME: the previous code returned -EIO if the addr wasn't a + * valid task virtual addr. The new one will return -EINVAL in this + * case. + * -EINVAL may be what we want for in-kernel breakpoints users, but + * -EIO looks better for ptrace, since we refuse a register writing + * for the user. And anyway this is the previous behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; + + return 0; +} + /* * Handle PTRACE_POKEUSR calls for the debug register area. 
*/ @@ -575,19 +649,13 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) return -EIO; if (n == 6) { - tsk->thread.debugreg6 = val; + thread->debugreg6 = val; goto ret_path; } if (n < HBP_NUM) { - if (thread->hbp[n]) { - if (arch_check_va_in_userspace(val, - thread->hbp[n]->info.len) == 0) { - rc = -EIO; - goto ret_path; - } - thread->hbp[n]->info.address = val; - } - thread->debugreg[n] = val; + rc = ptrace_set_breakpoint_addr(tsk, n, val); + if (rc) + return rc; } /* All that's left is DR7 */ if (n == 7) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 213a7a3e4562..565ebc65920e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include @@ -328,7 +327,6 @@ notrace static void __cpuinit start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - load_debug_registers(); cpu_idle(); } @@ -1269,7 +1267,6 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); - hw_breakpoint_disable(); } int native_cpu_disable(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc2974adf9b6..22dee7aa7813 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,6 +42,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" +#include #include #include #include @@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); - if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); - } + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (__get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK) + hw_breakpoint_restore(); set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index e09a44fc4664..0a979f3e5b8a 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -105,7 +105,6 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); #endif - hw_breakpoint_disable(); } /* Needed by apm.c */ @@ -144,11 +143,6 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - load_debug_registers(); } /** -- cgit v1.2.2 From 46dc281b1bb02527195fe2ad50a3af6d7f7f7325 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 8 Nov 2009 18:53:56 +0300 Subject: x86, apic: Use PAGE_SIZE instead of numbers The whole page is reserved for IO-APIC fixmap due to non-cacheable requirement. So lets note this explicitly instead of playing with numbers. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Maciej W. 
Rozycki LKML-Reference: <20091108155356.GB25940@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 31e9db3c12ad..9ee1c1628c17 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4111,7 +4111,7 @@ fake_ioapic_page: idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res->end = ioapic_phys + PAGE_SIZE-1; ioapic_res++; } } -- cgit v1.2.2 From 4343fe1024e09e17667f95620ed3e69a7a5f4389 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 8 Nov 2009 18:54:31 +0300 Subject: x86, ioapic: Use snrpintf while set names for IO-APIC resourses We should be ready that one day MAX_IO_APICS may raise its number. To prevent memory overwrite we're to use safe snprintf while set IO-APIC resourse name. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20091108155431.GC25940@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ee1c1628c17..24d1458a1822 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4066,7 +4066,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) for (i = 0; i < nr_ioapics; i++) { res[i].name = mem; res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; } -- cgit v1.2.2 From f4a70c55376683213229af7266dc57ad81aee354 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 8 Nov 2009 16:16:45 +0300 Subject: x86, apic: Get rid of apicid_to_cpu_present assign on 64-bit In fact it's never get used on x86-64 (for 64 bit platform we use differ technique to enumerate io-units). Reported-by: Stephen Rothwell Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra LKML-Reference: <20091108131645.GD5300@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_noop.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 9ab6ffb313ac..89629f622b60 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -162,7 +162,12 @@ struct apic apic_noop = { .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, + +#ifdef CONFIG_X86_32 .apicid_to_cpu_present = default_apicid_to_cpu_present, +#else + .apicid_to_cpu_present = NULL, +#endif .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, -- cgit v1.2.2 From fd650a6394b3242edf125ba9c4d500349a6d7178 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 9 Nov 2009 13:52:26 -0500 Subject: x86: Generate .byte code for some new instructions via gas macro It will take some time for binutils (gas) to support some newly added instructions, such as SSE4.1 instructions or the AES-NI instructions found in upcoming Intel CPU. To make the source code can be compiled by old binutils, .byte code is used instead of the assembly instruction. But the readability and flexibility of raw .byte code is not good. This patch solves the issue of raw .byte code via generating it via assembly instruction like gas macro. 
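For instance (our hand-worked example, not part of the patch), a .S file can then write:

                /* with the macros in scope, e.g. via #include <asm/inst.h> */
                .text
                PSHUFB_XMM %xmm1, %xmm2
        /* expands to .byte 0x66, 0x0f, 0x38, 0x00, 0xd1, the encoding of
         * pshufb %xmm1, %xmm2, even when the assembler predates SSSE3 */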
The syntax is as close as possible to that of the real assembly instructions. Some helper macros, such as MODRM, are not full-featured implementations; they can be extended when necessary. Signed-off-by: Huang Ying Acked-by: H. Peter Anvin Signed-off-by: Herbert Xu --- arch/x86/include/asm/inst.h | 150 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 arch/x86/include/asm/inst.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h new file mode 100644 index 000000000000..14cf526091f9 --- /dev/null +++ b/arch/x86/include/asm/inst.h @@ -0,0 +1,150 @@ +/* + * Generate .byte code for some instructions not supported by old + * binutils. + */ +#ifndef X86_ASM_INST_H +#define X86_ASM_INST_H + +#ifdef __ASSEMBLY__ + + .macro XMM_NUM opd xmm + .ifc \xmm,%xmm0 + \opd = 0 + .endif + .ifc \xmm,%xmm1 + \opd = 1 + .endif + .ifc \xmm,%xmm2 + \opd = 2 + .endif + .ifc \xmm,%xmm3 + \opd = 3 + .endif + .ifc \xmm,%xmm4 + \opd = 4 + .endif + .ifc \xmm,%xmm5 + \opd = 5 + .endif + .ifc \xmm,%xmm6 + \opd = 6 + .endif + .ifc \xmm,%xmm7 + \opd = 7 + .endif + .ifc \xmm,%xmm8 + \opd = 8 + .endif + .ifc \xmm,%xmm9 + \opd = 9 + .endif + .ifc \xmm,%xmm10 + \opd = 10 + .endif + .ifc \xmm,%xmm11 + \opd = 11 + .endif + .ifc \xmm,%xmm12 + \opd = 12 + .endif + .ifc \xmm,%xmm13 + \opd = 13 + .endif + .ifc \xmm,%xmm14 + \opd = 14 + .endif + .ifc \xmm,%xmm15 + \opd = 15 + .endif + .endm + + .macro PFX_OPD_SIZE + .byte 0x66 + .endm + + .macro PFX_REX opd1 opd2 + .if (\opd1 | \opd2) & 8 + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) + .endif + .endm + + .macro MODRM mod opd1 opd2 + .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) + .endm + + .macro PSHUFB_XMM xmm1 xmm2 + XMM_NUM pshufb_opd1 \xmm1 + XMM_NUM pshufb_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX pshufb_opd1 pshufb_opd2 + .byte 0x0f, 0x38, 0x00 + MODRM 0xc0 pshufb_opd1 pshufb_opd2 + .endm + + .macro PCLMULQDQ imm8 xmm1 xmm2 + XMM_NUM clmul_opd1 \xmm1 + XMM_NUM clmul_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX clmul_opd1 clmul_opd2 + .byte 0x0f, 0x3a, 0x44 + MODRM 0xc0 clmul_opd1 clmul_opd2 + .byte \imm8 + .endm + + .macro AESKEYGENASSIST rcon xmm1 xmm2 + XMM_NUM aeskeygen_opd1 \xmm1 + XMM_NUM aeskeygen_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aeskeygen_opd1 aeskeygen_opd2 + .byte 0x0f, 0x3a, 0xdf + MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2 + .byte \rcon + .endm + + .macro AESIMC xmm1 xmm2 + XMM_NUM aesimc_opd1 \xmm1 + XMM_NUM aesimc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesimc_opd1 aesimc_opd2 + .byte 0x0f, 0x38, 0xdb + MODRM 0xc0 aesimc_opd1 aesimc_opd2 + .endm + + .macro AESENC xmm1 xmm2 + XMM_NUM aesenc_opd1 \xmm1 + XMM_NUM aesenc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenc_opd1 aesenc_opd2 + .byte 0x0f, 0x38, 0xdc + MODRM 0xc0 aesenc_opd1 aesenc_opd2 + .endm + + .macro AESENCLAST xmm1 xmm2 + XMM_NUM aesenclast_opd1 \xmm1 + XMM_NUM aesenclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenclast_opd1 aesenclast_opd2 + .byte 0x0f, 0x38, 0xdd + MODRM 0xc0 aesenclast_opd1 aesenclast_opd2 + .endm + + .macro AESDEC xmm1 xmm2 + XMM_NUM aesdec_opd1 \xmm1 + XMM_NUM aesdec_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdec_opd1 aesdec_opd2 + .byte 0x0f, 0x38, 0xde + MODRM 0xc0 aesdec_opd1 aesdec_opd2 + .endm + + .macro AESDECLAST xmm1 xmm2 + XMM_NUM aesdeclast_opd1 \xmm1 + XMM_NUM aesdeclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdeclast_opd1 aesdeclast_opd2 + .byte 0x0f, 0x38, 0xdf + MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 + .endm +#endif + +#endif -- cgit v1.2.2 From 6e18da75c28b592594fd632cf3e6eb09d3d078de Mon Sep 17 00:00:00 2001
From: Andreas Herrmann Date: Thu, 29 Oct 2009 14:47:42 +0100 Subject: x86, amd-ucode: Remove needless log messages Signed-off-by: Andreas Herrmann Cc: Borislav Petkov LKML-Reference: <20091029134742.GD30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index f4c538b681ca..c043534fd986 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -109,12 +109,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 0; } - if (mc_header->processor_rev_id != equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d: patch mismatch " - "(processor_rev_id: %x, equiv_cpu_id: %x)\n", - cpu, mc_header->processor_rev_id, equiv_cpu_id); + if (mc_header->processor_rev_id != equiv_cpu_id) return 0; - } /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { @@ -185,9 +181,6 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - printk(KERN_DEBUG "microcode: size %u, total_size %u\n", - size, total_size); - if (total_size > size || total_size > UCODE_MAX_SIZE) { printk(KERN_ERR "microcode: error: size mismatch\n"); return NULL; -- cgit v1.2.2 From 7abc07531383ac7f727cc9d44e1360a829f2082e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 10 Nov 2009 01:06:59 +0300 Subject: x86: apic: Do not use stacked physid_mask_t We should not use physid_mask_t as a stack-based variable in apic code. This type depends on the MAX_APICS parameter, which may be quite large. It became a problem especially with the apic NOOP driver, which is portable between 32-bit and 64-bit environments (where MAX_APICS is really huge). So the apic driver should operate on pointers, and the caller in turn should be aware of allocating the physid_mask_t variable. As a side (but positive) effect, we may use the already implemented physid_set_mask_of_physid function, eliminating default_apicid_to_cpu_present completely. Note that physids_coerce and physids_promote are turned from macros into static inlines (since a macro hides the fact that its parameter is interpreted as an unsigned long, make this explicit). Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Maciej W.
Rozycki Cc: Stephen Rothwell LKML-Reference: <20091109220659.GA5568@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 19 +++++++------------ arch/x86/include/asm/mpspec.h | 16 +++++++++------- arch/x86/kernel/apic/apic_noop.c | 18 ++++-------------- arch/x86/kernel/apic/bigsmp_32.c | 13 ++++--------- arch/x86/kernel/apic/es7000_32.c | 16 ++++++---------- arch/x86/kernel/apic/io_apic.c | 14 +++++++------- arch/x86/kernel/apic/numaq_32.c | 13 ++++++------- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 10 +++++----- arch/x86/kernel/visws_quirks.c | 2 +- 10 files changed, 50 insertions(+), 73 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 08a5f420e07b..b4ac2cdcb64f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -297,20 +297,20 @@ struct apic { int disable_esr; int dest_logical; - unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); + unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); - physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); + void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); void (*setup_apic_routing)(void); int (*multi_timer_check)(int apic, int irq); int (*apicid_to_node)(int logical_apicid); int (*cpu_to_logical_apicid)(int cpu); int (*cpu_present_to_apicid)(int mps_cpu); - physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); + void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); void (*setup_portio_remap)(void); int (*check_phys_apicid_present)(int phys_apicid); void (*enable_apic_mode)(void); @@ -534,9 +534,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return (unsigned int)(mask1 & mask2 & mask3); } -static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) +static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return physid_isset(apicid, *map); } static inline unsigned long default_check_apicid_present(int bit) @@ -544,9 +544,9 @@ static inline unsigned long default_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map) +static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { - return phys_map; + *retmap = *phys_map; } /* Mapping from cpu number to logical apicid */ @@ -585,11 +585,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); #endif -static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) -{ - return physid_mask_of_physid(phys_apicid); -} - #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 79c94500c0bb..61d90b1331c3 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -163,14 +163,16 @@ typedef struct physid_mask physid_mask_t; #define physids_shift_left(d, s, n) \ bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) -#define physids_coerce(map) ((map).mask[0]) +static inline unsigned long physids_coerce(physid_mask_t *map) +{ + return map->mask[0]; +} -#define physids_promote(physids) \ - ({ \ - physid_mask_t __physid_mask = 
PHYSID_MASK_NONE; \ - __physid_mask.mask[0] = physids; \ - __physid_mask; \ - }) +static inline void physids_promote(unsigned long physids, physid_mask_t *map) +{ + physids_clear(*map); + map->mask[0] = physids; +} /* Note: will create very large stack frames if physid_mask_t is big */ #define physid_mask_of_physid(physid) \ diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 89629f622b60..d9acc3bee0f4 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void) return 0; } -static physid_mask_t noop_ioapic_phys_id_map(physid_mask_t phys_map) -{ - return phys_map; -} - static int noop_cpu_to_logical_apicid(int cpu) { return 0; @@ -100,9 +95,9 @@ static const struct cpumask *noop_target_cpus(void) return cpumask_of(0); } -static unsigned long noop_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return physid_isset(apicid, *map); } static unsigned long noop_check_apicid_present(int bit) @@ -155,19 +150,14 @@ struct apic apic_noop = { .vector_allocation_domain = noop_vector_allocation_domain, .init_apic_ldr = noop_init_apic_ldr, - .ioapic_phys_id_map = noop_ioapic_phys_id_map, + .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, .multi_timer_check = NULL, .apicid_to_node = noop_apicid_to_node, .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, - -#ifdef CONFIG_X86_32 - .apicid_to_cpu_present = default_apicid_to_cpu_present, -#else - .apicid_to_cpu_present = NULL, -#endif + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 77a06413b6b2..38dcecfa5818 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void) #endif } -static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid) -{ - return physid_mask_of_physid(phys_apicid); -} - /* Mapping from cpu number to logical apicid */ static inline int bigsmp_cpu_to_logical_apicid(int cpu) { @@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu) return cpu_physical_id(cpu); } -static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) +static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xFFL); + physids_promote(0xFFL, retmap); } static int bigsmp_check_phys_apicid_present(int phys_apicid) @@ -230,7 +225,7 @@ struct apic apic_bigsmp = { .apicid_to_node = bigsmp_apicid_to_node, .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, - .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = bigsmp_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git 
a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 89174f847b49..e85f8fb7f8e7 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -466,11 +466,11 @@ static const struct cpumask *es7000_target_cpus(void) return cpumask_of(smp_processor_id()); } -static unsigned long -es7000_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } + static unsigned long es7000_check_apicid_present(int bit) { return physid_isset(bit, phys_cpu_present_map); @@ -539,14 +539,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu) static int cpu_id; -static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) +static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) { - physid_mask_t mask; - - mask = physid_mask_of_physid(cpu_id); + physid_set_mask_of_physid(cpu_id, retmap); ++cpu_id; - - return mask; } /* Mapping from cpu number to logical apicid */ @@ -561,10 +557,10 @@ static int es7000_cpu_to_logical_apicid(int cpu) #endif } -static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) +static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xff); + physids_promote(0xFFL, retmap); } static int es7000_check_phys_apicid_present(int cpu_physical_apicid) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 24d1458a1822..20ea8392bc57 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2031,7 +2031,7 @@ void __init setup_ioapic_ids_from_mpc(void) * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. @@ -2058,7 +2058,7 @@ void __init setup_ioapic_ids_from_mpc(void) * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(phys_id_present_map, + if (apic->check_apicid_used(&phys_id_present_map, mp_ioapics[apic_id].apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic_id, mp_ioapics[apic_id].apicid); @@ -2073,7 +2073,7 @@ void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid = i; } else { physid_mask_t tmp; - tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); + apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", mp_ioapics[apic_id].apicid); @@ -3904,7 +3904,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) */ if (physids_empty(apic_id_map)) - apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); @@ -3920,10 +3920,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. 
*/ - if (apic->check_apicid_used(apic_id_map, apic_id)) { + if (apic->check_apicid_used(&apic_id_map, apic_id)) { for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(apic_id_map, i)) + if (!apic->check_apicid_used(&apic_id_map, i)) break; } @@ -3936,7 +3936,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) apic_id = i; } - tmp = apic->apicid_to_cpu_present(apic_id); + apic->apicid_to_cpu_present(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index efa00e2b8505..07cdbdcd7a92 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -334,10 +334,9 @@ static inline const struct cpumask *numaq_target_cpus(void) return cpu_all_mask; } -static inline unsigned long -numaq_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return physid_isset(apicid, *map); } static inline unsigned long numaq_check_apicid_present(int bit) @@ -371,10 +370,10 @@ static inline int numaq_multi_timer_check(int apic, int irq) return apic != 0 && irq == 0; } -static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) +static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* We don't have a good way to do this yet - hack */ - return physids_promote(0xFUL); + return physids_promote(0xFUL, retmap); } static inline int numaq_cpu_to_logical_apicid(int cpu) @@ -402,12 +401,12 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } -static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) +static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); int cpu = __ffs(logical_apicid & 0xf); - return physid_mask_of_physid(cpu + 4*node); + physid_set_mask_of_physid(cpu + 4*node, retmap); } /* Where the IO area was mapped on multiquad, always 0 otherwise */ diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0c0182cc947d..1a6559f6768c 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -108,7 +108,7 @@ struct apic apic_default = { .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = default_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 645ecc4ff0be..9b419263d90d 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void) return cpumask_of(0); } -static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) +static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do 
this yet - hack */ - return physids_promote(0x0F); + physids_promote(0x0FL, retmap); } -static physid_mask_t summit_apicid_to_cpu_present(int apicid) +static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap) { - return physid_mask_of_physid(0); + physid_set_mask_of_physid(0, retmap); } static int summit_check_phys_apicid_present(int physical_apicid) diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index f068553a1b17..cff70c86e18e 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apic_cpus = apic->apicid_to_cpu_present(m->apicid); + apic->apicid_to_cpu_present(m->apicid, &apic_cpus); physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); /* * Validate version -- cgit v1.2.2 From a2202aa29289db64ca7988b12343158b67b27f10 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Tue, 10 Nov 2009 09:38:24 +0800 Subject: x86: Under BIOS control, restore AP's APIC_LVTTHMR to the BSP value On platforms where the BIOS handles the thermal monitor interrupt, APIC_LVTTHMR on each logical CPU is programmed to generate an SMI, and the OS must not touch it. Unfortunately, the AP bringup sequence using INIT-SIPI-SIPI clears all the LVT entries except the mask bit. Essentially this leaves all LVT entries, including the thermal monitoring interrupt, masked (clearing the BIOS-programmed value for APIC_LVTTHMR). This leads to the kernel taking over the thermal monitoring interrupt on the APs but not on the BSP (leaving the BIOS-programmed value only on the BSP). As a result, we have seen system hangs when the thermal monitoring interrupt is generated. Fix this by reading the initial value of the thermal LVT entry on the BSP; if the BIOS has taken over control, program the same value on all APs and leave control of the thermal monitoring interrupt on all logical CPUs to the BIOS.
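Distilled into C (a simplified, illustrative sketch with invented function names, not the literal patch; the real diff follows after the sign-offs):

    static u32 lvtthmr_init;    /* the BIOS-programmed value, as seen by the BSP */

    /* On the boot CPU, early, before the LVT can be clobbered: */
    static void save_bsp_thermal_lvt(void)          /* invented name */
    {
            lvtthmr_init = apic_read(APIC_LVTTHMR);
    }

    /* On every CPU during thermal init: trust the saved BSP value, not the
     * local LVT, which INIT-SIPI-SIPI reset to masked (0x10000) on the APs. */
    static void init_thermal_on_cpu(void)           /* invented name */
    {
            apic_write(APIC_LVTTHMR, lvtthmr_init);
            if (lvtthmr_init & APIC_DM_SMI)
                    return;     /* BIOS owns the thermal interrupt: hands off */
            /* ... otherwise the kernel installs its own thermal vector ... */
    }

The actual patch also checks MSR_IA32_MISC_ENABLE as before; the sketch only shows the save-on-BSP/restore-on-AP idea.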
Signed-off-by: Yong Wang Reviewed-by: Suresh Siddha Cc: Borislav Petkov Cc: Arjan van de Ven LKML-Reference: <20091110013824.GA24940@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar Cc: stable@kernel.org --- arch/x86/include/asm/mce.h | 9 +++++++++ arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 29 ++++++++++++++++++++++++++++- arch/x86/kernel/setup.c | 3 +++ 5 files changed, 43 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 161485da6838..858baa061cfc 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -120,8 +120,10 @@ extern int mce_disabled; extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE +int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); #else +static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} #endif @@ -215,5 +217,12 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); + +#ifdef CONFIG_X86_THERMAL_VECTOR +extern void mcheck_intel_therm_init(void); +#else +static inline void mcheck_intel_therm_init(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4df69a38be57..9053be5d95cd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -837,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } -#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); -#endif select_idle_routine(c); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 80801705edd7..0d4102031a4c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1655,13 +1655,14 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); -static int __init mcheck_init(void) +int __init mcheck_init(void) { atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + mcheck_intel_therm_init(); + return 0; } -early_initcall(mcheck_init); /* * Sysfs support diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index b3a1dba75330..7f3cf36ed124 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); +static u32 lvtthmr_init __read_mostly; + #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) @@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +void mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. 
Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && + cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); + + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. + * Always restore the value that BIOS has programmed on AP based on + * BSP's info we saved since BIOS is always setting the same value + * for all threads/cores + */ + apic_write(APIC_LVTTHMR, lvtthmr_init); + + h = lvtthmr_init; + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e09f0e2c14b5..179c1f2aa457 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -109,6 +109,7 @@ #ifdef CONFIG_X86_64 #include #endif +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -1024,6 +1025,8 @@ void __init setup_arch(char **cmdline_p) #endif #endif x86_init.oem.banner(); + + mcheck_init(); } #ifdef CONFIG_X86_32 -- cgit v1.2.2 From 41855b77547fa18d90ed6a5d322983d3fdab1959 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 9 Nov 2009 17:58:50 -0800 Subject: x86: GART: pci-gart_64.c: Use correct length in strncmp Signed-off-by: Joe Perches Cc: # .3x.x LKML-Reference: <1257818330.12852.72.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a9bcdf7c8801..eb46ab3f52b2 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -858,7 +858,7 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) + if (!strncmp(p, "fullflush", 9)) iommu_fullflush = 1; if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; -- cgit v1.2.2 From 83ea05ea69290b2e30da795527dbe304db1e2331 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 10 Nov 2009 17:23:07 +0800 Subject: x86: pat: Clean up req_type special case for reserve_memtype() Commit: b6ff32d: x86, PAT: Consolidate code in pat_x_mtrr_type() and reserve_memtype() consolidated code in pat_x_mtrr_type() and reserve_memtype(), which removed the special case (req_type is -1) for the PAT-enabled part. We should also change comments and the PAT-disabled part. 
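In practical terms (an illustrative before/after sketch, not taken from the patch): a caller that used to pass the magic value -1 to inherit the effective type now simply requests write-back and lets the consolidated logic downgrade it, which is also what makes ioremap_default() redundant in a follow-up commit below:

    unsigned long flags;
    int err;

    /* before: 'inherit from MTRR / existing PAT' via a special req_type */
    err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);

    /* after: request WB; reserve_memtype()/pat_x_mtrr_type() degrade it
     * to UC_MINUS or WC where the range cannot be mapped write-back */
    err = reserve_memtype(phys_addr, phys_addr + size,
                          _PAGE_CACHE_WB, &flags);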
Signed-off-by: Xiaotian Feng Cc: Suresh Siddha Cc: Venkatesh Pallipadi LKML-Reference: <1257844987-7906-1-git-send-email-dfeng@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e78cd0ec2bcf..81fb75344cd3 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -355,9 +355,6 @@ static int free_ram_pages_type(u64 start, u64 end) * - _PAGE_CACHE_UC_MINUS * - _PAGE_CACHE_UC * - * req_type will have a special case value '-1', when requester want to inherit - * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. - * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error @@ -377,9 +374,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { - if (req_type == -1) - *new_type = _PAGE_CACHE_WB; - else if (req_type == _PAGE_CACHE_WC) + if (req_type == _PAGE_CACHE_WC) *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; -- cgit v1.2.2 From 2fb8f4e6a83dcaec15c1dd0ee8a6f618e7ece7f0 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 10 Nov 2009 17:23:25 +0800 Subject: x86: pat: Remove ioremap_default() Commit: b6ff32d: x86, PAT: Consolidate code in pat_x_mtrr_type() and reserve_memtype() consolidated reserve_memtype() and pat_x_mtrr_type, this made ioremap_default() same as ioremap_cache(). Remove the redundant function and change the only caller to use ioremap_cache. Signed-off-by: Xiaotian Feng Cc: Suresh Siddha Cc: Venkatesh Pallipadi LKML-Reference: <1257845005-7938-1-git-send-email-dfeng@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 334e63ca7b2b..3af10dee0147 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -283,30 +283,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) } EXPORT_SYMBOL(ioremap_cache); -static void __iomem *ioremap_default(resource_size_t phys_addr, - unsigned long size) -{ - unsigned long flags; - void __iomem *ret; - int err; - - /* - * - WB for WB-able memory and no other conflicting mappings - * - UC_MINUS for non-WB-able memory with no other conflicting mappings - * - Inherit from confliting mappings otherwise - */ - err = reserve_memtype(phys_addr, phys_addr + size, - _PAGE_CACHE_WB, &flags); - if (err < 0) - return NULL; - - ret = __ioremap_caller(phys_addr, size, flags, - __builtin_return_address(0)); - - free_memtype(phys_addr, phys_addr + size); - return ret; -} - void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, unsigned long prot_val) { @@ -382,7 +358,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - addr = (void __force *)ioremap_default(start, PAGE_SIZE); + addr = (void __force *)ioremap_cache(start, PAGE_SIZE); if (addr) addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); -- cgit v1.2.2 From 9f6b3c2c30cfbb1166ce7e74a8f9fd93ae19d2de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 9 Nov 2009 21:03:43 +0100 Subject: hw-breakpoints: Fix broken a.out format dump Fix the 
broken a.out format dump. For now we only dump the ptrace breakpoints. TODO: Dump all perf breakpoints for the current thread, not only the ptrace-based ones. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: "K. Prasad" --- arch/x86/include/asm/a.out-core.h | 10 ++-------- arch/x86/include/asm/debugreg.h | 2 ++ arch/x86/kernel/hw_breakpoint.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index fc4685dd6e4d..7a15588e45d4 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -17,6 +17,7 @@ #include #include +#include /* * fill in the user structure for an a.out core dump */ @@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg[0]; - dump->u_debugreg[1] = current->thread.debugreg[1]; - dump->u_debugreg[2] = current->thread.debugreg[2]; - dump->u_debugreg[3] = current->thread.debugreg[3]; - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - dump->u_debugreg[7] = current->thread.debugreg7; + aout_dump_debugregs(dump); if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 9a3333c91f9a..f1b673f08239 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -89,6 +89,8 @@ static inline void hw_breakpoint_disable(void) set_debugreg(0UL, 3); } +extern void aout_dump_debugregs(struct user *dump); + #ifdef CONFIG_KVM extern void hw_breakpoint_restore(void); #endif diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index e622620790bd..57dcee5fa958 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -375,6 +375,41 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return 0; } +/* + * Dump the debug register contents to the user. + * We can't dump our per cpu values because it + * may contain cpu wide breakpoint, something that + * doesn't belong to the current task. + * + * TODO: include non-ptrace user breakpoints (perf) + */ +void aout_dump_debugregs(struct user *dump) +{ + int i; + int dr7 = 0; + struct perf_event *bp; + struct arch_hw_breakpoint *info; + struct thread_struct *thread = &current->thread; + + for (i = 0; i < HBP_NUM; i++) { + bp = thread->ptrace_bps[i]; + + if (bp && !bp->attr.disabled) { + dump->u_debugreg[i] = bp->attr.bp_addr; + info = counter_arch_bp(bp); + dr7 |= encode_dr7(i, info->len, info->type); + } else { + dump->u_debugreg[i] = 0; + } + } + + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + + dump->u_debugreg[7] = dr7; +} + /* * Release the user breakpoints used by ptrace */ -- cgit v1.2.2 From 59d8eb53ea9947db7cad8ebc31b0fb54f23a9851 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Nov 2009 11:03:12 +0100 Subject: hw-breakpoints: Wrap in the KVM breakpoint active state check Wrap in the cpu dr7 check that tells if we have active breakpoints that need to be restored in the cpu. This wrapper makes the check more self-explanatory and also reusable for further uses. Reported-by: Jan Kiszka Signed-off-by: Frederic Weisbecker Cc: Avi Kivity Cc: "K.
Prasad" --- arch/x86/include/asm/debugreg.h | 5 +++++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index f1b673f08239..0f6e92af4227 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -89,6 +89,11 @@ static inline void hw_breakpoint_disable(void) set_debugreg(0UL, 3); } +static inline int hw_breakpoint_active(void) +{ + return __get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK; +} + extern void aout_dump_debugregs(struct user *dump); #ifdef CONFIG_KVM diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 22dee7aa7813..3817220cc86b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3651,7 +3651,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * care about the messed up debug address registers. But if * we have some of them active, restore the old state. */ - if (__get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK) + if (hw_breakpoint_active()) hw_breakpoint_restore(); set_bit(KVM_REQ_KICK, &vcpu->requests); -- cgit v1.2.2 From d1c84f79a6ba992dc01e312c44a21496303874d6 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 10 Nov 2009 12:07:23 +0100 Subject: x86: ucode-amd: Load ucode-patches once and not separately of each CPU This also implies that corresponding log messages, e.g. platform microcode: firmware: requesting amd-ucode/microcode_amd.bin show up only once on module load and not when ucode is updated for each CPU. Signed-off-by: Andreas Herrmann Cc: dimm LKML-Reference: <20091110110723.GH30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode.h | 2 ++ arch/x86/kernel/microcode_amd.c | 24 +++++++++++++++++------- arch/x86/kernel/microcode_core.c | 6 ++++++ 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index ef51b501e22a..c24ca9a56458 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -12,6 +12,8 @@ struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; struct microcode_ops { + void (*init)(struct device *device); + void (*fini)(void); enum ucode_state (*request_microcode_user) (int cpu, const void __user *buf, size_t size); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c043534fd986..75538f647193 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -33,6 +33,8 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 +const struct firmware *firmware; + struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -301,14 +303,10 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { - const char *fw_name = "amd-ucode/microcode_amd.bin"; - const struct firmware *firmware; enum ucode_state ret; - if (request_firmware(&firmware, fw_name, device)) { - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); + if (firmware == NULL) return UCODE_NFOUND; - } if (*(u32 *)firmware->data != UCODE_MAGIC) { printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", @@ -318,8 +316,6 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) ret = generic_load_microcode(cpu, firmware->data, firmware->size); - release_firmware(firmware); - return ret; } @@ -339,7 
+335,21 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +void init_microcode_amd(struct device *device) +{ + const char *fw_name = "amd-ucode/microcode_amd.bin"; + if (request_firmware(&firmware, fw_name, device)) + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); +} + +void fini_microcode_amd(void) +{ + release_firmware(firmware); +} + static struct microcode_ops microcode_amd_ops = { + .init = init_microcode_amd, + .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 378e9a8f1bf8..d2a816021d9f 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -520,6 +520,9 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } + if (microcode_ops->init) + microcode_ops->init(&microcode_pdev->dev); + get_online_cpus(); mutex_lock(&microcode_mutex); @@ -563,6 +566,9 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); + if (microcode_ops->fini) + microcode_ops->fini(); + microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -- cgit v1.2.2 From 14c569425a0ae12cbeed72fdb8ebe78c48455dfd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 10 Nov 2009 12:08:25 +0100 Subject: x86: ucode-amd: Don't warn when no ucode is available for a CPU revision There is no point in warning when there is no ucode available for a specific CPU revision. Currently the container file, which provides the AMD ucode patches for OS load, contains only a few ucode patches. Whenever new ucode was available and an update happened, this is already clearly indicated by the printed patch_level. So the warning message is of no help and is merely annoying on systems with many CPUs. Signed-off-by: Andreas Herrmann Cc: dimm LKML-Reference: <20091110110825.GI30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 75538f647193..9f13324054ce 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -105,11 +105,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) i++; } - if (!equiv_cpu_id) { - printk(KERN_WARNING "microcode: CPU%d: cpu revision " - "not listed in equivalent cpu table\n", cpu); + if (!equiv_cpu_id) return 0; - } if (mc_header->processor_rev_id != equiv_cpu_id) return 0; -- cgit v1.2.2 From 1a74357066369be91e6f4f431621a00b052df964 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 10 Nov 2009 12:09:20 +0100 Subject: x86: ucode-amd: Convert printk(KERN_*...) to pr_*(...)
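For reference, the conversion is mechanical: in this era's kernel.h the pr_* helpers are thin wrappers that bake in the log level (shown simplified here):

    #define pr_err(fmt, ...)    printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
    #define pr_info(fmt, ...)   printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)

    /* pr_fmt(fmt) defaults to fmt, so each hunk is a purely textual change: */
    printk(KERN_ERR "microcode: error: size mismatch\n");   /* before */
    pr_err("microcode: error: size mismatch\n");            /* after */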
Signed-off-by: Andreas Herrmann Cc: dimm LKML-Reference: <20091110110920.GJ30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 9f13324054ce..26e33bd8485b 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -78,12 +78,12 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); return -1; } rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -113,7 +113,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - printk(KERN_ERR "microcode: CPU%d: loading of chipset " + pr_err(KERN_ERR "microcode: CPU%d: loading of chipset " "specific code not yet supported\n", cpu); return 0; } @@ -143,14 +143,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - printk(KERN_ERR "microcode: CPU%d: update failed " + pr_err("microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); return -1; } - printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", - cpu, rev); - + pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -173,7 +171,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - printk(KERN_ERR "microcode: error: invalid type field in " + pr_err("microcode: error: invalid type field in " "container file section header\n"); return NULL; } @@ -181,7 +179,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error: size mismatch\n"); + pr_err("microcode: error: size mismatch\n"); return NULL; } @@ -210,15 +208,14 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - printk(KERN_ERR "microcode: error: invalid type field in " + pr_err("microcode: error: invalid type field in " "container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - printk(KERN_ERR "microcode: failed to allocate " - "equivalent CPU table\n"); + pr_err("microcode: failed to allocate equivalent CPU table\n"); return 0; } @@ -251,8 +248,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - printk(KERN_ERR "microcode: failed to create " - "equivalent cpu table\n"); + pr_err("microcode: failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -306,7 +302,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return 
UCODE_NFOUND; if (*(u32 *)firmware->data != UCODE_MAGIC) { - printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)firmware->data); return UCODE_ERROR; } @@ -319,8 +315,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_INFO "microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); + pr_info("microcode: AMD microcode update via " + "/dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -336,7 +332,7 @@ void init_microcode_amd(struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; if (request_firmware(&firmware, fw_name, device)) - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); + pr_err("microcode: failed to load file %s\n", fw_name); } void fini_microcode_amd(void) -- cgit v1.2.2 From d07c1be0693e0902d743160b8b638585b808f8ac Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:12 +0900 Subject: x86: Add iommu_init to x86_init_ops We call the detections functions of all the IOMMUs then all their initialization functions. The latter is pointless since we don't detect multiple different IOMMUs. What we need to do is calling the initialization function of the detected IOMMU. This adds iommu_init hook to x86_init_ops so if an IOMMU detection function can set its initialization function to the hook. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/pci-dma.c | 2 ++ arch/x86/kernel/x86_init.c | 5 +++++ 3 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 66008ed80b7a..d8e71459f025 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -90,6 +90,14 @@ struct x86_init_timers { void (*timer_init)(void); }; +/** + * struct x86_init_iommu - platform specific iommu setup + * @iommu_init: platform specific iommu setup + */ +struct x86_init_iommu { + int (*iommu_init)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -101,6 +109,7 @@ struct x86_init_ops { struct x86_init_oem oem; struct x86_init_paging paging; struct x86_init_timers timers; + struct x86_init_iommu iommu; }; /** diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 839d49a669bc..a13478da533c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -292,6 +292,8 @@ static int __init pci_iommu_init(void) dma_debug_add_bus(&pci_bus_type); #endif + x86_init.iommu.iommu_init(); + calgary_iommu_init(); intel_iommu_init(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index bc9b230ef402..c46984d122dc 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -19,6 +19,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } +int __init iommu_init_noop(void) { return 0; } /* * The platform setup functions are preset with the default functions @@ -63,6 +64,10 @@ struct x86_init_ops x86_init __initdata = { .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, }, + + .iommu = { + 
.iommu_init = iommu_init_noop, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { -- cgit v1.2.2 From d7b9f7be216b04ff9d108f856bc03d96e7b3439c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:13 +0900 Subject: x86: Calgary: Convert detect_calgary() to use iommu_init hook This changes detect_calgary() to set init_calgary() to iommu_init hook if detect_calgary() finds the Calgary IOMMU. We can kill the code to check if we found the IOMMU in init_calgary() since detect_calgary() sets init_calgary() only when it found the IOMMU. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com LKML-Reference: <1257849980-22640-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calgary.h | 2 -- arch/x86/kernel/pci-calgary_64.c | 11 +++++------ arch/x86/kernel/pci-dma.c | 2 -- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index b03bedb62aa7..0918654305af 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -62,10 +62,8 @@ struct cal_chipset_ops { extern int use_calgary; #ifdef CONFIG_CALGARY_IOMMU -extern int calgary_iommu_init(void); extern void detect_calgary(void); #else -static inline int calgary_iommu_init(void) { return 1; } static inline void detect_calgary(void) { return; } #endif diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 971a3bec47a8..47bd419ea4d2 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -46,6 +46,7 @@ #include #include #include +#include #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -1344,6 +1345,8 @@ static void __init get_tce_space_from_tar(void) return; } +int __init calgary_iommu_init(void); + void __init detect_calgary(void) { int bus; @@ -1445,6 +1448,8 @@ void __init detect_calgary(void) /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) swiotlb = 1; + + x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1461,12 +1466,6 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || (swiotlb && !calgary_detected)) - return -ENODEV; - - if (!calgary_detected) - return -ENODEV; - /* ok, we're trying to use Calgary - let's roll */ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a13478da533c..0224da88256a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -294,8 +294,6 @@ static int __init pci_iommu_init(void) x86_init.iommu.iommu_init(); - calgary_iommu_init(); - intel_iommu_init(); amd_iommu_init(); -- cgit v1.2.2 From de957628ce7c84764ff41331111036b3ae5bad0f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:14 +0900 Subject: x86: GART: Convert gart_iommu_hole_init() to use iommu_init hook This changes gart_iommu_hole_init() to set gart_iommu_init() to iommu_init hook if gart_iommu_hole_init() finds the GART IOMMU. We can kill the code to check if we found the IOMMU in gart_iommu_init() since gart_iommu_hole_init() sets gart_iommu_init() only when it found the IOMMU. 
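The registration pattern shared by these conversions (Calgary and GART above, AMD and Intel below) can be sketched like this; detect_some_iommu() and some_iommu_init() are invented names, while x86_init.iommu.iommu_init and iommu_init_noop come from the patches themselves:

    /* detection: runs early and publishes the initializer only on success */
    void __init detect_some_iommu(void)            /* invented example */
    {
            if (!some_iommu_present())
                    return;
            iommu_detected = 1;
            x86_init.iommu.iommu_init = some_iommu_init;
    }

    /* initialization: exactly one hook runs; the default is a no-op */
    static int __init pci_iommu_init(void)
    {
            x86_init.iommu.iommu_init();    /* iommu_init_noop() if nothing was detected */
            return 0;
    }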
Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-4-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/gart.h | 5 +---- arch/x86/kernel/aperture_64.c | 2 ++ arch/x86/kernel/pci-dma.c | 2 -- arch/x86/kernel/pci-gart_64.c | 15 +++++---------- 4 files changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4fdd5b3f87b1..4ac5b0f33fc1 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -35,7 +35,7 @@ extern int gart_iommu_aperture_allowed; extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); +extern int gart_iommu_init(void); extern void __init gart_parse_options(char *); extern void gart_iommu_hole_init(void); @@ -47,9 +47,6 @@ extern void gart_iommu_hole_init(void); static inline void early_gart_iommu_check(void) { } -static inline void gart_iommu_init(void) -{ -} static inline void gart_parse_options(char *options) { } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 128111d8ffe0..03933cf0b63c 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -28,6 +28,7 @@ #include #include #include +#include int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; @@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void) iommu_detected = 1; gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0224da88256a..ecde8543537f 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -298,8 +298,6 @@ static int __init pci_iommu_init(void) amd_iommu_init(); - gart_iommu_init(); - no_iommu_init(); return 0; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index eb46ab3f52b2..0410bd30060d 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -709,7 +709,7 @@ static void gart_iommu_shutdown(void) } } -void __init gart_iommu_init(void) +int __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; @@ -719,7 +719,7 @@ void __init gart_iommu_init(void) long i; if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) - return; + return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; @@ -731,13 +731,6 @@ void __init gart_iommu_init(void) (agp_copy_info(agp_bridge, &info) < 0); #endif - if (swiotlb) - return; - - /* Did we detect a different HW IOMMU? 
*/ - if (iommu_detected && !gart_iommu_aperture) - return; - if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || @@ -747,7 +740,7 @@ void __init gart_iommu_init(void) "but GART IOMMU not available.\n"); printk(KERN_WARNING "falling back to iommu=soft.\n"); } - return; + return 0; } /* need to map that range */ @@ -840,6 +833,8 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; x86_platform.iommu_shutdown = gart_iommu_shutdown; + + return 0; } void __init gart_parse_options(char *p) -- cgit v1.2.2 From ea1b0d3945c7374849235b6ecaea1191ee1d9d50 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:15 +0900 Subject: x86: amd_iommu: Convert amd_iommu_detect() to use iommu_init hook This changes amd_iommu_detect() to set amd_iommu_init to iommu_init hook if amd_iommu_detect() finds the AMD IOMMU. We can kill the code to check if we found the IOMMU in amd_iommu_init() since amd_iommu_detect() sets amd_iommu_init() only when it found the IOMMU. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-5-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/amd_iommu.h | 2 -- arch/x86/kernel/amd_iommu_init.c | 17 +++-------------- arch/x86/kernel/pci-dma.c | 2 -- 3 files changed, 3 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 3604669f7b15..b8ef2ee93643 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -23,7 +23,6 @@ #include #ifdef CONFIG_AMD_IOMMU -extern int amd_iommu_init(void); extern int amd_iommu_init_dma_ops(void); extern int amd_iommu_init_passthrough(void); extern void amd_iommu_detect(void); @@ -32,7 +31,6 @@ extern void amd_iommu_flush_all_domains(void); extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_apply_erratum_63(u16 devid); #else -static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } #endif diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6acd43e9afd7..c41aabddaa2a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * definitions for the ACPI scanning code @@ -1176,19 +1177,10 @@ static struct sys_device device_amd_iommu = { * functions. Finally it prints some information about AMD IOMMUs and * the driver state and enables the hardware. */ -int __init amd_iommu_init(void) +static int __init amd_iommu_init(void) { int i, ret = 0; - - if (no_iommu) { - printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); - return 0; - } - - if (!amd_iommu_detected) - return -ENODEV; - /* * First parse ACPI tables to find the largest Bus/Dev/Func * we need to handle. 
Upon this information the shared data @@ -1344,10 +1336,7 @@ void __init amd_iommu_detect(void) if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif + x86_init.iommu.iommu_init = amd_iommu_init; } } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index ecde8543537f..5ca44a9301a0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -296,8 +296,6 @@ static int __init pci_iommu_init(void) intel_iommu_init(); - amd_iommu_init(); - no_iommu_init(); return 0; } -- cgit v1.2.2 From 9d5ce73a64be2be8112147a3e0b551ad9cd1247b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:16 +0900 Subject: x86: intel-iommu: Convert detect_intel_iommu to use iommu_init hook This changes detect_intel_iommu() to set intel_iommu_init() to iommu_init hook if detect_intel_iommu() finds the IOMMU. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-6-git-send-email-fujita.tomonori@lab.ntt.co.jp> [ -v2: build fix for the !CONFIG_DMAR case ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5ca44a9301a0..bed05e2e5890 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -294,8 +294,6 @@ static int __init pci_iommu_init(void) x86_init.iommu.iommu_init(); - intel_iommu_init(); - no_iommu_init(); return 0; } -- cgit v1.2.2 From ad32e8cb86e7894aac51c8963eaa9f36bb8a4e14 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:19 +0900 Subject: swiotlb: Defer swiotlb init printing, export swiotlb_print_info() This enables us to avoid printing swiotlb memory info when we initialize swiotlb. After swiotlb initialization, we could find that we don't need swiotlb. This patch removes the code to print swiotlb memory info in swiotlb_init() and exports the function to do that. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com Cc: tony.luck@intel.com Cc: benh@kernel.crashing.org LKML-Reference: <1257849980-22640-9-git-send-email-fujita.tomonori@lab.ntt.co.jp> [ -v2: merge up conflict ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index aaa6b7839f1e..ea20ef7ca523 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -52,8 +52,7 @@ void __init pci_swiotlb_init(void) if (swiotlb_force) swiotlb = 1; if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_init(); + swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } } -- cgit v1.2.2 From 75f1cdf1dda92cae037ec848ae63690d91913eac Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:20 +0900 Subject: x86: Handle HW IOMMU initialization failure gracefully If HW IOMMU initialization fails (Intel VT-d often does this, typically due to BIOS bugs), we fall back to nommu. It doesn't work for the majority since nowadays we have more than 4GB memory so we must use swiotlb instead of nommu. 
The problem is that it's too late to initialize swiotlb when HW IOMMU initialization fails. We need to allocate swiotlb memory earlier, from the bootmem allocator. Chris explained the issue in detail: http://marc.info/?l=linux-kernel&m=125657444317079&w=2 The current x86 IOMMU initialization sequence is too complicated and handling the above issue makes it more hacky. This patch changes the x86 IOMMU initialization sequence to handle the above issue cleanly. The new x86 IOMMU initialization sequence is:
1. we initialize the swiotlb (and set swiotlb to 1) in the case of (max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to swiotlb_dma_ops or nommu_dma_ops. if swiotlb usage is forced by the boot option, we finish here.
2. we call the detection functions of all the IOMMUs
3. the detection function sets x86_init.iommu.iommu_init to the IOMMU initialization function (so we can avoid calling the initialization functions of all the IOMMUs needlessly).
4. if the IOMMU initialization function doesn't need swiotlb, it sets swiotlb to zero (i.e. the initialization was successful).
5. if we find that swiotlb is set to zero, we free the swiotlb resources.
Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-10-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iommu.h | 1 - arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- arch/x86/kernel/aperture_64.c | 2 +- arch/x86/kernel/pci-calgary_64.c | 10 +--------- arch/x86/kernel/pci-dma.c | 21 +++++++++++++-------- arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/pci-nommu.c | 9 --------- arch/x86/kernel/pci-swiotlb.c | 7 +++---- 9 files changed, 21 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 878b30715766..df42a712361f 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -2,7 +2,6 @@ #define _ASM_X86_IOMMU_H static inline void iommu_shutdown_noop(void) {} -extern void no_iommu_init(void); extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0285521e0a99..66237fde758f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2110,8 +2110,8 @@ int __init amd_iommu_init_dma_ops(void) prealloc_protection_domains(); iommu_detected = 1; - force_iommu = 1; bad_dma_address = 0; + swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c41aabddaa2a..0d4581e602a4 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1330,7 +1330,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) + if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 03933cf0b63c..e0dfb6856aa2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -458,7 +458,7 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (swiotlb && !valid_agp) { + } else if (!valid_agp) { /* Do 
nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 47bd419ea4d2..833f491440b9 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1360,7 +1360,7 @@ void __init detect_calgary(void) * if the user specified iommu=off or iommu=soft or we found * another HW IOMMU already, bail out. */ - if (swiotlb || no_iommu || iommu_detected) + if (no_iommu || iommu_detected) return; if (!use_calgary) @@ -1445,10 +1445,6 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", specified_table_size); - /* swiotlb for devices that aren't behind the Calgary. */ - if (max_pfn > MAX_DMA32_PFN) - swiotlb = 1; - x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1476,11 +1472,7 @@ int __init calgary_iommu_init(void) return ret; } - force_iommu = 1; bad_dma_address = 0x0; - /* dma_ops is set to swiotlb or nommu */ - if (!dma_ops) - dma_ops = &nommu_dma_ops; return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index bed05e2e5890..a234e63c2656 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -124,24 +124,24 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { + /* swiotlb is forced by the boot option */ + int use_swiotlb = swiotlb; #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif + pci_swiotlb_init(); + if (use_swiotlb) + return; - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ gart_iommu_hole_init(); detect_calgary(); detect_intel_iommu(); + /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); - - pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, @@ -291,10 +291,15 @@ static int __init pci_iommu_init(void) #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); #endif - x86_init.iommu.iommu_init(); - no_iommu_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else + swiotlb_free(); + return 0; } /* Must execute after PCI subsystem */ diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0410bd30060d..919182e15d1e 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -833,6 +833,7 @@ int __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; return 0; } diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a3933d4330cd..875e3822ae61 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, }; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index ea20ef7ca523..17ce4221bd03 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -46,13 +46,12 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) + if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif - if (swiotlb_force) - swiotlb = 1; if 
(swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; - } + } else + dma_ops = &nommu_dma_ops; } -- cgit v1.2.2 From 72d03802b8b5c841ab1da82bff0652628cbadf60 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 21:35:17 +0900 Subject: x86, 32-bit: Fix swiotlb boot crash Ingo Molnar reported this boot crash: [ 8.655620] pata_amd 0000:00:06.0: version 0.4.1 [ 8.660286] BUG: unable to handle kernel NULL pointer dereference at 00000034 [ 8.663572] IP: [] dma_supported+0x3b/0xa4 [ 8.663572] *pde = 00000000 Initialize dma_ops properly in the 32-bit case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a234e63c2656..63eebee80e75 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -129,6 +129,8 @@ void __init pci_iommu_alloc(void) #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); +#else + dma_ops = &nommu_dma_ops; #endif pci_swiotlb_init(); if (use_swiotlb) -- cgit v1.2.2 From b4941a9a606f0131559cc040b64e8437ac7b32c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 10 Nov 2009 14:37:58 +0100 Subject: x86: Add iommu_init to x86_init_ops, fix build Most of the time x86_init.h is included in pci-dma.c - but not always, leading to this rare build failure: arch/x86/kernel/pci-dma.c:296: error: 'x86_init' undeclared (first use in this function) So include asm/x86_init.h explicitly. Cc: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 63eebee80e75..f79870e89266 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,6 +11,7 @@ #include #include #include +#include static int forbid_dac __read_mostly; -- cgit v1.2.2 From 85160b92fbd35321104819283c91bfed2b553e3c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 10 Nov 2009 13:49:24 -0500 Subject: x86: Add new Intel CPU cache size descriptors The latest rev of Intel doc AP-485 details new cache descriptors that we don't yet support. 12MB, 18MB and 24MB 24-way assoc L3 caches. 
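[ Illustration, not part of the patch: how a CPUID-leaf-2 descriptor table of this shape is typically scanned; the struct and identifiers below are simplified stand-ins, not the kernel's exact definitions. ]

struct cache_desc {
	unsigned char  descriptor;	/* CPUID leaf 2 descriptor byte */
	unsigned char  level;		/* cache level, e.g. 3 for L3 */
	unsigned short size_kb;		/* cache size in KB */
};

static const struct cache_desc example_table[] = {
	{ 0xea, 3, 12288 },	/* 24-way set assoc, 64 byte line size */
	{ 0xeb, 3, 18432 },	/* 24-way set assoc, 64 byte line size */
	{ 0xec, 3, 24576 },	/* 24-way set assoc, 64 byte line size */
	{ 0x00, 0, 0 }		/* terminator */
};

static unsigned short example_lookup(unsigned char desc)
{
	int i;

	/* linear scan over the zero-terminated table */
	for (i = 0; example_table[i].descriptor != 0; i++)
		if (example_table[i].descriptor == desc)
			return example_table[i].size_kb;

	return 0;	/* unknown descriptor */
}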
Signed-off-by: Dave Jones LKML-Reference: <20091110184924.GA20337@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 804c40e2bc3e..14103924b627 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; -- cgit v1.2.2 From e02e0e1a130b9ca37c5186d38ad4b3aaf58bb149 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 10 Nov 2009 15:01:20 -0500 Subject: x86: Fix typo in Intel CPU cache size descriptor I double-checked the datasheet. One of the existing descriptors has a typo: it should be 2MB not 2038 KB. Signed-off-by: Dave Jones Cc: # .3x.x: 85160b9: x86: Add new Intel CPU cache size descriptors Cc: # .3x.x LKML-Reference: <20091110200120.GA27090@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 14103924b627..8178d0352935 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ -- cgit v1.2.2 From 200a9ae2801bc725f2c41ab13f6e0fb1610d2fb6 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 10 Nov 2009 13:58:35 -0600 Subject: x86: Remove asm/apicnum.h arch/x86/include/asm/apicnum.h is not referenced anywhere anymore. Its definitions appear in apicdef.h. Remove it. 
Signed-off-by: Dimitri Sivanich Acked-by: Cyrill Gorcunov Acked-by: Mike Travis LKML-Reference: <20091110195835.GA4393@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicnum.h | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 arch/x86/include/asm/apicnum.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h deleted file mode 100644 index 82f613c607ce..000000000000 --- a/arch/x86/include/asm/apicnum.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_X86_APICNUM_H -#define _ASM_X86_APICNUM_H - -/* define MAX_IO_APICS */ -#ifdef CONFIG_X86_32 -# define MAX_IO_APICS 64 -#else -# define MAX_IO_APICS 128 -# define MAX_LOCAL_APIC 32768 -#endif - -#endif /* _ASM_X86_APICNUM_H */ -- cgit v1.2.2 From e84446de5cccd90de7d7ec46527d3b343b022a09 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Nov 2009 15:46:52 -0800 Subject: x86 VDSO: Fix Kconfig help COMPAT_VDSO has 2 help text blocks, but kconfig only uses the last one found, so merge the 2 blocks. It would be really nice if kconfig would warn about this. Signed-off-by: Randy Dunlap LKML-Reference: <4AF9FB6C.70003@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 72ace9515a07..618dedeb31fc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1602,7 +1602,7 @@ config COMPAT_VDSO depends on X86_32 || IA32_EMULATION ---help--- Map the 32-bit VDSO to the predictable old-style address too. - ---help--- + Say N here if you are running a sufficiently recent glibc version (2.3.3 or later), to remove the high-mapped VDSO mapping and to exclusively use the randomized VDSO. -- cgit v1.2.2 From ce6b5d768c79b9d5dd6345c033bae781d5ca9b8e Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 11 Nov 2009 15:51:25 +0800 Subject: x86: Mark the thermal init functions __init Mark the thermal init functions __init so that the init memory can be freed. Signed-off-by: Yong Wang LKML-Reference: <20091111075125.GA17900@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7f3cf36ed124..8a73d5c12a05 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -256,7 +256,7 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } -void mcheck_intel_therm_init(void) +void __init mcheck_intel_therm_init(void) { /* * This function is only called on boot CPU. Save the init thermal @@ -268,7 +268,7 @@ void mcheck_intel_therm_init(void) lvtthmr_init = apic_read(APIC_LVTTHMR); } -void intel_init_thermal(struct cpuinfo_x86 *c) +void __init intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); int tm2 = 0; -- cgit v1.2.2 From b18485e7acfe1a634615d1c628ef644c0d58d472 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 12 Nov 2009 00:03:28 +0900 Subject: swiotlb: Remove the swiotlb variable usage POWERPC doesn't expect it to be used. 
This fixes the linux-next build failure reported by Stephen Rothwell: lib/swiotlb.c: In function 'setup_io_tlb_npages': lib/swiotlb.c:114: error: 'swiotlb' undeclared (first use in this function) Reported-by: Stephen Rothwell Signed-off-by: FUJITA Tomonori Cc: peterz@infradead.org LKML-Reference: <20091112000258F.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/swiotlb.h | 5 +++-- arch/x86/kernel/pci-dma.c | 5 +---- arch/x86/kernel/pci-swiotlb.c | 13 ++++++++++++- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index b9e4e20174fb..940f13a213f8 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -9,11 +9,12 @@ extern int swiotlb_force; #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern void pci_swiotlb_init(void); +extern int pci_swiotlb_init(void); #else #define swiotlb 0 -static inline void pci_swiotlb_init(void) +static inline int pci_swiotlb_init(void) { + return 0; } #endif diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f79870e89266..0b11bf18f540 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -125,16 +125,13 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { - /* swiotlb is forced by the boot option */ - int use_swiotlb = swiotlb; #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #else dma_ops = &nommu_dma_ops; #endif - pci_swiotlb_init(); - if (use_swiotlb) + if (pci_swiotlb_init()) return; gart_iommu_hole_init(); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 17ce4221bd03..a6e5d0ffa3a7 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -42,16 +42,27 @@ static struct dma_map_ops swiotlb_dma_ops = { .dma_supported = NULL, }; -void __init pci_swiotlb_init(void) +/* + * pci_swiotlb_init - initialize swiotlb if necessary + * + * This returns non-zero if we are forced to use swiotlb (by the boot + * option). + */ +int __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif + if (swiotlb_force) + swiotlb = 1; + if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } else dma_ops = &nommu_dma_ops; + + return swiotlb_force; } -- cgit v1.2.2 From 9f15226e75583547aaf542c6be4bdac1060dd425 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 11 Nov 2009 20:03:29 +0100 Subject: x86, ucode-amd: Ensure ucode update on suspend/resume after CPU off/online cycle When switching a CPU offline/online and then doing suspend/resume, ucode is not updated on this CPU. This is due to the microcode_fini_cpu() call which frees uci->mc when setting the CPU offline: static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; vfree(uci->mc); uci->mc = NULL; } When the CPU is set online uci->mc is still NULL because no ucode update is required. Finally this prevents ucode update when resuming after suspend: static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!uci->mc) return UCODE_NFOUND; ... } Fix is to check whether uci->mc is valid before microcode_resume_cpu() is called. 
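[ Illustration, not part of the patch: the resulting dispatch logic as a sketch, assuming the ucode_cpu_info layout quoted above; it condenses the one-line diff below rather than reproducing the kernel function verbatim. ]

static enum ucode_state example_update_cpu(int cpu)
{
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;

	/*
	 * uci->valid alone is not enough: an offline/online cycle
	 * frees uci->mc, so resuming from saved microcode is only
	 * safe when both are set.
	 */
	if (uci->valid && uci->mc)
		return microcode_resume_cpu(cpu);

	return microcode_init_cpu(cpu);
}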
Signed-off-by: Andreas Herrmann Cc: dimm LKML-Reference: <20091111190329.GF18592@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index d2a816021d9f..adf234061540 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -393,7 +393,7 @@ static enum ucode_state microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; enum ucode_state ustate; - if (uci->valid) + if (uci->valid && uci->mc) ustate = microcode_resume_cpu(cpu); else ustate = microcode_init_cpu(cpu); -- cgit v1.2.2 From 196cf0d67acad70ebb2572da489d5cc7066cdd05 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 10 Nov 2009 18:27:23 -0800 Subject: x86: Make sure wakeup trampoline code is below 1MB Instead of using bootmem, try find_e820_area()/reserve_early(), and call acpi_reserve_memory() early, to allocate the wakeup trampoline code area below 1M. This is more reliable, and it also removes a dependency on bootmem. -v2: change function name to acpi_reserve_wakeup_memory(), as suggested by Rafael. Signed-off-by: Yinghai Lu Acked-by: H. Peter Anvin Acked-by: Rafael J. Wysocki Cc: pm list Cc: Len Brown Cc: Linus Torvalds LKML-Reference: <4AFA210B.3020207@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/acpi.h | 2 +- arch/x86/kernel/acpi/sleep.c | 15 +++++++++------ arch/x86/kernel/setup.c | 13 +++++++------ 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index e3d4a0daff57..60d2b2db0bc5 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void); extern unsigned long acpi_wakeup_address; /* early initialization routine */ -extern void acpi_reserve_bootmem(void); +extern void acpi_reserve_wakeup_memory(void); /* * Check if the CPU can handle C2 and deeper diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ca93638ba430..4a411450dfa0 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -119,29 +119,32 @@ void acpi_restore_state_mem(void) /** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation * * We allocate a page from the first 1MB of memory for the wakeup * routine for when we come back from a sleep state. The * runtime allocator allows specification of <16MB pages, but not * <1MB pages. 
*/ -void __init acpi_reserve_bootmem(void) +void __init acpi_reserve_wakeup_memory(void) { + unsigned long mem; + if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { printk(KERN_ERR "ACPI: Wakeup code way too big, S3 disabled.\n"); return; } - acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); + mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - if (!acpi_realmode) { + if (mem == -1L) { printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); return; } - - acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); + acpi_realmode = (unsigned long) phys_to_virt(mem); + acpi_wakeup_address = mem; + reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f89141982702..0a6e94ab8339 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -897,6 +897,13 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. + * even before init_memory_mapping + */ + acpi_reserve_wakeup_memory(); +#endif init_gbpages(); /* max_pfn_mapped is updated here */ @@ -948,12 +955,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn, acpi, k8); -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif /* * Find and reserve possible boot-time SMP configuration: */ -- cgit v1.2.2 From cffd377e5879ea58522224a785a083f201afd80e Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 12 Nov 2009 15:52:40 +0900 Subject: x86, mce: Fix __init annotations The intel_init_thermal() is called from resume path, so it cannot be marked as __init. OTOH mce_banks_init() is only called from __mcheck_cpu_cap_init() which is marked as __cpuinit, so it can be also marked as __cpuinit. Signed-off-by: Hidetoshi Seto Acked-by: Yong Wang LKML-Reference: <4AFBB0B8.2070501@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0d4102031a4c..5f277cad2ed7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1201,7 +1201,7 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int mce_banks_init(void) +static int __cpuinit __mcheck_cpu_mce_banks_init(void) { int i; @@ -1242,7 +1242,7 @@ static int __cpuinit __mcheck_cpu_cap_init(void) WARN_ON(banks != 0 && b != banks); banks = b; if (!mce_banks) { - int err = mce_banks_init(); + int err = __mcheck_cpu_mce_banks_init(); if (err) return err; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 8a73d5c12a05..4fef985fc221 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -268,7 +268,7 @@ void __init mcheck_intel_therm_init(void) lvtthmr_init = apic_read(APIC_LVTTHMR); } -void __init intel_init_thermal(struct cpuinfo_x86 *c) +void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); int tm2 = 0; -- cgit v1.2.2 From db48cccc7c709ccfa7cb4ac702bc27c216bffee7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 12 Nov 2009 11:25:34 +0900 Subject: perf_event, x86: Annotate init functions and data Annotate init functions and data with __init and __initconst. 
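[ Illustration, not part of the patch: what the two annotations buy you; the identifiers below are made up for this example. ]

#include <linux/init.h>
#include <linux/types.h>

/* __initconst data lands in .init.rodata and is discarded after boot */
static __initconst const u64 example_event_ids[] = { 0x003c, 0x00c0 };

/* __init code lands in .init.text; calling it after init memory is freed
 * would be a bug */
static __init int example_pmu_setup(void)
{
	return example_event_ids[0] ? 0 : -1;
}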
Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <4AFB721E.8070203@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2e20bca3cca1..bd8743024204 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -245,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static const u64 nehalem_hw_cache_event_ids +static __initconst u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -336,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids }, }; -static const u64 core2_hw_cache_event_ids +static __initconst u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -427,7 +427,7 @@ static const u64 core2_hw_cache_event_ids }, }; -static const u64 atom_hw_cache_event_ids +static __initconst u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -536,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event) return hw_event & CORE_EVNTSEL_MASK; } -static const u64 amd_hw_cache_event_ids +static __initconst u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -1964,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { .priority = 1 }; -static struct x86_pmu p6_pmu = { +static __initconst struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = p6_pmu_handle_irq, .disable_all = p6_pmu_disable_all, @@ -1992,7 +1992,7 @@ static struct x86_pmu p6_pmu = { .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu intel_pmu = { +static __initconst struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, @@ -2016,7 +2016,7 @@ static struct x86_pmu intel_pmu = { .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu amd_pmu = { +static __initconst struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, @@ -2037,7 +2037,7 @@ static struct x86_pmu amd_pmu = { .get_event_idx = gen_get_event_idx, }; -static int p6_pmu_init(void) +static __init int p6_pmu_init(void) { switch (boot_cpu_data.x86_model) { case 1: @@ -2071,7 +2071,7 @@ static int p6_pmu_init(void) return 0; } -static int intel_pmu_init(void) +static __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -2144,7 +2144,7 @@ static int intel_pmu_init(void) return 0; } -static int amd_pmu_init(void) +static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) -- cgit v1.2.2 From 24a065624dcdd91e8bfd0f14113feb91c7ed11ca Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Apr 2009 05:33:18 -0700 Subject: sysctl x86: Remove dead binary sysctl support Now that sys_sysctl is a generic wrapper around /proc/sys .ctl_name and .strategy members of sysctl tables are dead code. Remove them. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: Eric W. 
Biederman --- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/vdso/vdso32-setup.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8cb4974ff599..e02d92d12bcd 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -237,7 +237,7 @@ static ctl_table kernel_table2[] = { }; static ctl_table kernel_root_table2[] = { - { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, + { .procname = "kernel", .mode = 0555, .child = kernel_table2 }, {} }; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 58bc00f68b12..02b442e92007 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -393,7 +393,6 @@ static ctl_table abi_table2[] = { static ctl_table abi_root_table2[] = { { - .ctl_name = CTL_ABI, .procname = "abi", .mode = 0555, .child = abi_table2 -- cgit v1.2.2 From 15cd8812ab2ce62a2f779e93a8398bdad752291a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 12 Nov 2009 18:15:43 -0500 Subject: x86: Remove the CPU cache size printk's They aren't really useful, and they pollute the dmesg output a lot (especially on machines with many cores). Also the same information can be trivially found out from userspace. Reported-by: Mike Travis Signed-off-by: Dave Jones Acked-by: H. Peter Anvin Cc: Andi Kleen Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091112231542.GA7129@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 804c40e2bc3e..0df4c2b7107f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -488,22 +488,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) #endif } - if (trace) - printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if (l1i) - printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); - - if (l1d) - printk(KERN_CONT ", L1 D cache: %dK\n", l1d); - else - printk(KERN_CONT "\n"); - - if (l2) - printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); - - if (l3) - printk(KERN_INFO "CPU: L3 cache: %dK\n", l3); - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); return l2; -- cgit v1.2.2 From d9b263528e01bfbaf716b51f38606b3dfe5ac1e9 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 13 Nov 2009 14:57:00 -0500 Subject: x86, setup: Store the boot cursor state Add a field to store the boot cursor state and implement this for VGA on x86. This can then be used to set the default policy for the boot console. Signed-off-by: Matthew Garrett LKML-Reference: <1258142222-16092-1-git-send-email-mjg@redhat.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/video.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index d42da3802499..f767164cd5df 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -27,6 +27,12 @@ static void store_cursor_position(void) boot_params.screen_info.orig_x = oreg.dl; boot_params.screen_info.orig_y = oreg.dh; + + if (oreg.ch & 0x20) + boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR; + + if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f)) + boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR; } static void store_video_mode(void) -- cgit v1.2.2 From 0388423dba2217b4e5b6c61690b0506d13b25a49 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 13 Nov 2009 15:30:00 -0500 Subject: x86: Minimise printk spew from per-vendor init code In the default case where the kernel supports all CPU vendors, we currently print out a bunch of not useful messages on every system. 32-bit: KERNEL supported cpus: Intel GenuineIntel AMD AuthenticAMD NSC Geode by NSC Cyrix CyrixInstead Centaur CentaurHauls Transmeta GenuineTMx86 Transmeta TransmetaCPU UMC UMC UMC UMC 64-bit: KERNEL supported cpus: Intel GenuineIntel AMD AuthenticAMD Centaur CentaurHauls Given that "what CPUs does the kernel support" isn't useful for the "support everything" case, we can suppress these printk's. Signed-off-by: Dave Jones LKML-Reference: <20091113203000.GA19160@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..617a29f95b3c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -656,6 +656,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) void __init early_cpu_init(void) { +#ifdef PROCESSOR_SELECT const struct cpu_dev *const *cdev; int count = 0; @@ -676,7 +677,7 @@ void __init early_cpu_init(void) cpudev->c_ident[j]); } } - +#endif early_identify_cpu(&boot_cpu_data); } -- cgit v1.2.2 From b01c845f0f2e3f9e54e6a78d5d56895f5b95e818 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 13 Nov 2009 14:38:26 -0800 Subject: x86: Remove CPU cache size output for non-Intel too As Dave Jones said about the output in intel_cacheinfo.c: "They aren't useful, and pollute the dmesg output a lot (especially on machines with many cores). Also the same information can be trivially found out from userspace." Give the generic display_cacheinfo() function the same treatment. 
Signed-off-by: Roland Dreier Acked-by: Dave Jones Cc: Mike Travis Cc: Andi Kleen Cc: Heiko Carstens Cc: Randy Dunlap Cc: Tejun Heo Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 617a29f95b3c..9db1e2425c27 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size = (ecx>>24) + (edx>>24); #ifdef CONFIG_X86_64 /* On K8 L1 TLB is inclusive, so don't count it */ @@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) #endif c->x86_cache_size = l2size; - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); } void __cpuinit detect_ht(struct cpuinfo_x86 *c) -- cgit v1.2.2 From 31c997cac76e62918858a432fff6e43fd48425f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Nov 2009 10:34:41 +0100 Subject: x86: Fix cpu_devs[] initialization in early_cpu_init() Yinghai Lu noticed that this commit: 0388423: x86: Minimise printk spew from per-vendor init code mistakenly left out the initialization of cpu_devs[] in the !PROCESSOR_SELECT case. Fix it. Reported-by: Yinghai Lu Cc: Dave Jones LKML-Reference: <20091113203000.GA19160@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9db1e2425c27..61242a56c2d6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -651,28 +651,34 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) void __init early_cpu_init(void) { -#ifdef PROCESSOR_SELECT const struct cpu_dev *const *cdev; int count = 0; +#ifdef PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); +#endif + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; - unsigned int j; if (count >= X86_VENDOR_NUM) break; cpu_devs[count] = cpudev; count++; - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); +#ifdef PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } } - } #endif + } early_identify_cpu(&boot_cpu_data); } -- cgit v1.2.2 From 68efa37df779b3e04280598e8b5b3a1919b65fee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Nov 2009 01:35:29 +0100 Subject: hw-breakpoints, x86: Fix modular KVM build This build error: arch/x86/kvm/x86.c:3655: error: implicit declaration of function 'hw_breakpoint_restore' Happens because in the CONFIG_KVM=m case there's no 'CONFIG_KVM' define in the kernel - it's CONFIG_KVM_MODULE in that case. Make the prototype available unconditionally. 
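[ Illustration, not part of the patch: the Kconfig pitfall behind this build error, using the real CONFIG_KVM symbols. The alternative guard shown second is a hypothetical repair; the patch itself simply drops the #ifdef. ]

/* Broken for modular KVM: CONFIG_KVM is undefined when CONFIG_KVM=m,
 * only CONFIG_KVM_MODULE is, so the prototype is hidden from the build: */
#ifdef CONFIG_KVM
extern void hw_breakpoint_restore(void);
#endif

/* A guard covering both built-in and modular KVM would be: */
#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
extern void hw_breakpoint_restore(void);
#endif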
Cc: Frederic Weisbecker Cc: Prasad LKML-Reference: <1258114575-32655-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/debugreg.h | 2 -- arch/x86/kernel/hw_breakpoint.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 0f6e92af4227..fdabd8435765 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -96,9 +96,7 @@ static inline int hw_breakpoint_active(void) extern void aout_dump_debugregs(struct user *dump); -#ifdef CONFIG_KVM extern void hw_breakpoint_restore(void); -#endif #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 57dcee5fa958..752daebe91c6 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -43,6 +43,7 @@ /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, dr7); +EXPORT_PER_CPU_SYMBOL(dr7); /* Per cpu debug address registers values */ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); @@ -409,6 +410,7 @@ void aout_dump_debugregs(struct user *dump) dump->u_debugreg[7] = dr7; } +EXPORT_SYMBOL_GPL(aout_dump_debugregs); /* * Release the user breakpoints used by ptrace @@ -424,7 +426,6 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) } } -#ifdef CONFIG_KVM void hw_breakpoint_restore(void) { set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); @@ -435,7 +436,6 @@ void hw_breakpoint_restore(void) set_debugreg(__get_cpu_var(dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); -#endif /* * Handle debug exception notifications. -- cgit v1.2.2 From a3b28ee1090072092e2be043c24df94230e725b2 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 14 Nov 2009 20:46:36 +0900 Subject: x86: Set dma_ops to nommu_dma_ops by default We set dma_ops to nommu_dma_ops at two different places for x86_32 and x86_64. This unifies them by setting dma_ops to nommu_dma_ops by default. 
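[ Illustration, not part of the patch: a condensed sketch of the "safe default, override on detection" flow this change enables; the function name is hypothetical and the bodies are elided. ]

/* compile-time default -- correct for every configuration */
struct dma_map_ops *dma_ops = &nommu_dma_ops;

void __init example_pci_iommu_alloc(void)
{
	/* swiotlb may install swiotlb_dma_ops and end initialization */
	if (pci_swiotlb_init())
		return;

	/*
	 * Detection may register a HW IOMMU init hook that installs
	 * gart/calgary/amd/intel ops later; if nothing fires, the
	 * nommu default simply stays in place.
	 */
	gart_iommu_hole_init();
	detect_calgary();
	detect_intel_iommu();
	amd_iommu_detect();
}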
Signed-off-by: FUJITA Tomonori LKML-Reference: <1258199198-16657-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 4 +--- arch/x86/kernel/pci-swiotlb.c | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0b11bf18f540..f170b5364b41 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -15,7 +15,7 @@ static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops; +struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -128,8 +128,6 @@ void __init pci_iommu_alloc(void) #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); -#else - dma_ops = &nommu_dma_ops; #endif if (pci_swiotlb_init()) return; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index a6e5d0ffa3a7..e36e71daa44c 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -61,8 +61,7 @@ int __init pci_swiotlb_init(void) if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; - } else - dma_ops = &nommu_dma_ops; + } return swiotlb_force; } -- cgit v1.2.2 From 94a15564ac63af6bb2ff8d4d04f86d5e7ee0278a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 14 Nov 2009 20:46:37 +0900 Subject: x86: Move iommu_shutdown_noop to x86_init.c iommu_init_noop() is in arch/x86/kernel/x86_init.c but iommu_shutdown_noop() in arch/x86/include/asm/iommu.h. This moves iommu_shutdown_noop() to x86_init.c for consistency. Signed-off-by: FUJITA Tomonori LKML-Reference: <1258199198-16657-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iommu.h | 1 - arch/x86/kernel/x86_init.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index df42a712361f..345c99cef152 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H -static inline void iommu_shutdown_noop(void) {} extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c46984d122dc..80f3ae24b974 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -20,6 +20,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } +void __init iommu_shutdown_noop(void) { } /* * The platform setup functions are preset with the default functions -- cgit v1.2.2 From 6959450e567c1f17d3ce8489099fc56c3721d577 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 14 Nov 2009 20:46:38 +0900 Subject: swiotlb: Remove duplicate swiotlb_force extern declarations Signed-off-by: FUJITA Tomonori Cc: tony.luck@intel.com LKML-Reference: <1258199198-16657-4-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/swiotlb.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 940f13a213f8..87ffcb12a1b8 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -3,10 +3,6 @@ #include -/* SWIOTLB interface */ - -extern int 
swiotlb_force; - #ifdef CONFIG_SWIOTLB extern int swiotlb; extern int pci_swiotlb_init(void); -- cgit v1.2.2 From f4131c6259b46bd84dcfcd3bb9ed08e99e2875a4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 14 Nov 2009 21:26:50 +0900 Subject: x86: Make calgary_iommu_init() static This makes calgary_iommu_init() static and moves it to remove the forward declaration. Signed-off-by: FUJITA Tomonori Cc: muli@il.ibm.com LKML-Reference: <20091114212603U.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 833f491440b9..c84ad037f586 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1345,7 +1345,24 @@ static void __init get_tce_space_from_tar(void) return; } -int __init calgary_iommu_init(void); +static int __init calgary_iommu_init(void) +{ + int ret; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + return ret; + } + + bad_dma_address = 0x0; + + return 0; +} void __init detect_calgary(void) { @@ -1458,25 +1475,6 @@ cleanup: } } -int __init calgary_iommu_init(void) -{ - int ret; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - bad_dma_address = 0x0; - - return 0; -} - static int __init calgary_parse_options(char *p) { unsigned int bridge; -- cgit v1.2.2 From 14722485830fe6baba738b91d96f06fbd6cf7a18 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 13 Nov 2009 11:56:24 +0000 Subject: x86-64: __copy_from_user_inatomic() adjustments This v2.6.26 commit: ad2fc2c: x86: fix copy_user on x86 rendered __copy_from_user_inatomic() identical to copy_user_generic(), yet didn't make the former just call the latter from an inline function. Furthermore, this v2.6.19 commit: b885808: [PATCH] Add proper sparse __user casts to __copy_to_user_inatomic converted the return type of __copy_to_user_inatomic() from unsigned long to int, but didn't do the same to __copy_from_user_inatomic(). 
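[ Illustration, not part of the patch: the resulting inline under a hypothetical name, showing both adjustments at once -- the int return type and the direct call to copy_user_generic() that makes the out-of-line asm entry point (and its EXPORT_SYMBOL) unnecessary. ]

static __must_check __always_inline int
example_copy_from_user_inatomic(void *dst, const void __user *src,
				unsigned size)
{
	/* the same body the asm stub duplicated before this cleanup */
	return copy_user_generic(dst, (__force const void *)src, size);
}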
Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Alexander Viro Cc: Arjan van de Ven Cc: Andi Kleen Cc: LKML-Reference: <4AFD5778020000780001F8F4@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_64.h | 7 +++++-- arch/x86/kernel/x8664_ksyms_64.c | 1 - arch/x86/lib/copy_user_64.S | 6 ------ 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index ce6fec7ce38d..7adebacaa325 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -193,8 +193,11 @@ __must_check long strlen_user(const char __user *str); __must_check unsigned long clear_user(void __user *mem, unsigned long len); __must_check unsigned long __clear_user(void __user *mem, unsigned long len); -__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, - unsigned size); +static __must_check __always_inline int +__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) +{ + return copy_user_generic(dst, (__force const void *)src, size); +} static __must_check __always_inline int __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0cdd8cc1d67..cd54276b6be8 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -32,7 +32,6 @@ EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 4be3c415b3e9..39369985f1cb 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -96,12 +96,6 @@ ENTRY(copy_user_generic) CFI_ENDPROC ENDPROC(copy_user_generic) -ENTRY(__copy_from_user_inatomic) - CFI_STARTPROC - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(__copy_from_user_inatomic) - .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) -- cgit v1.2.2 From dc186ad741c12ae9ecac8b89e317ef706fdaf8f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2009 01:09:48 +0900 Subject: workqueue: Add debugobjects support Add debugobject support to track the life time of work_structs. While at it, remove duplicate definition of INIT_DELAYED_WORK_ON_STACK(). 
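[ Illustration, not part of the patch: the on-stack work item lifecycle that debugobjects can now verify; the handler and function names are hypothetical. ]

#include <linux/workqueue.h>

static void example_handler(struct work_struct *work)
{
	/* runs in workqueue context */
}

static void example_run_on_stack_work(void)
{
	struct work_struct work;

	INIT_WORK_ON_STACK(&work, example_handler);	/* registers the object */
	schedule_work(&work);
	flush_work(&work);		/* must complete before the frame dies */
	destroy_work_on_stack(&work);	/* required on every exit path */
}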
Signed-off-by: Thomas Gleixner Signed-off-by: Tejun Heo --- arch/x86/kernel/smpboot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 565ebc65920e..ba43dfed353d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -687,7 +687,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK(&c_idle.work, do_fork_idle); + INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -713,6 +713,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); + destroy_work_on_stack(&c_idle.work); return PTR_ERR(c_idle.idle); } @@ -831,6 +832,7 @@ do_rest: smpboot_restore_warm_reset_vector(); } + destroy_work_on_stack(&c_idle.work); return boot_error; } -- cgit v1.2.2 From 62ad33f67003b9a7b6013f0511579b9805e11626 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 16 Nov 2009 11:44:30 +0900 Subject: x86: Don't put iommu_shutdown_noop() in init section It causes kernel panic on shutdown or reboot. Signed-off-by: Hiroshi Shimamoto Acked-by: FUJITA Tomonori LKML-Reference: <4B00BC8E.50801@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/x86_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 80f3ae24b974..d11c5ff7c65e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -20,7 +20,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } -void __init iommu_shutdown_noop(void) { } +void iommu_shutdown_noop(void) { } /* * The platform setup functions are preset with the default functions -- cgit v1.2.2 From 411462f62a65eeae7f451c6eb7a38b9d8759c61a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2009 11:52:39 +0100 Subject: x86: Fix printk format due to variable type change clockevents.mult became u32. Fix the printk format. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 894aa97f0717..cf4ee5195c5e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -662,7 +662,7 @@ static int __init calibrate_APIC_clock(void) calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", calibration_result); -- cgit v1.2.2 From 303fc0870f8fbfabe260c5c32b18e53458d597ea Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 12 Nov 2009 13:09:31 -0500 Subject: x86: AMD Northbridge: Verify NB's node is online Fix panic seen on some IBM and HP systems on 2.6.32-rc6: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] find_next_bit+0x77/0x9c [...] [] cpumask_next_and+0x2e/0x3b [] pci_device_probe+0x8e/0xf5 [] ? driver_sysfs_add+0x47/0x6c [] driver_probe_device+0xd9/0x1f9 [] __driver_attach+0x58/0x7c [] ? 
__driver_attach+0x0/0x7c [] bus_for_each_dev+0x54/0x89 [] driver_attach+0x19/0x1b [] bus_add_driver+0xd3/0x23d [] driver_register+0x98/0x109 [] __pci_register_driver+0x63/0xd3 [] ? up_read+0x26/0x2a [] ? k8temp_init+0x0/0x20 [k8temp] [] k8temp_init+0x1e/0x20 [k8temp] [] do_one_initcall+0x6d/0x185 [] sys_init_module+0xd3/0x236 [] system_call_fastpath+0x16/0x1b I put in a printk and commented out the set_dev_node() call when and got this output: quirk_amd_nb_node: current numa_node = 0x0, would set to val & 7 = 0x0 quirk_amd_nb_node: current numa_node = 0x0, would set to val & 7 = 0x1 quirk_amd_nb_node: current numa_node = 0x0, would set to val & 7 = 0x2 quirk_amd_nb_node: current numa_node = 0x0, would set to val & 7 = 0x3 I.e. the issue appears to be that the HW has set val to a valid value, however, the system is only configured for a single node -- 0, the others are offline. Check to see if the node is actually online before setting the numa node for an AMD northbridge in quirk_amd_nb_node(). Signed-off-by: Prarit Bhargava Cc: bhavna.sarathy@amd.com Cc: jbarnes@virtuousgeek.org Cc: andreas.herrmann3@amd.com LKML-Reference: <20091112180933.12532.98685.sendpatchset@prarit.bos.redhat.com> [ v2: clean up the code and add comments ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 6c3b2c6fd772..18093d7498f0 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -499,6 +499,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) { struct pci_dev *nb_ht; unsigned int devfn; + u32 node; u32 val; devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); @@ -507,7 +508,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) return; pci_read_config_dword(nb_ht, 0x60, &val); - set_dev_node(&dev->dev, val & 7); + node = val & 7; + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); pci_dev_put(nb_ht); } -- cgit v1.2.2 From 3c93ca00eeeb774c7dd666cc7286a9e90c53e998 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Nov 2009 15:42:18 +0100 Subject: x86: Add missing might_fault() checks to copy_{to,from}_user() On x86-64, copy_[to|from]_user() rely on assembly routines that never call might_fault(), making us missing various lockdep checks. This doesn't apply to __copy_from,to_user() that explicitly handle these calls, neither is it a problem in x86-32 where copy_to,from_user() rely on the "__" prefixed versions that also call might_fault(). 
Signed-off-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Linus Torvalds Cc: Nick Piggin Cc: Peter Zijlstra LKML-Reference: <1258382538-30979-1-git-send-email-fweisbec@gmail.com> [ v2: fix module export ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_64.h | 10 +++++++++- arch/x86/kernel/x8664_ksyms_64.c | 2 +- arch/x86/lib/copy_user_64.S | 4 ++-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 7adebacaa325..46324c6a4f6e 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -19,7 +19,7 @@ __must_check unsigned long copy_user_generic(void *to, const void *from, unsigned len); __must_check unsigned long -copy_to_user(void __user *to, const void *from, unsigned len); +_copy_to_user(void __user *to, const void *from, unsigned len); __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned len); __must_check unsigned long @@ -32,6 +32,7 @@ static inline unsigned long __must_check copy_from_user(void *to, int sz = __compiletime_object_size(to); int ret = -EFAULT; + might_fault(); if (likely(sz == -1 || sz >= n)) ret = _copy_from_user(to, from, n); #ifdef CONFIG_DEBUG_VM @@ -41,6 +42,13 @@ static inline unsigned long __must_check copy_from_user(void *to, return ret; } +static __always_inline __must_check +int copy_to_user(void __user *dst, const void *src, unsigned size) +{ + might_fault(); + + return _copy_to_user(dst, src, size); +} static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index cd54276b6be8..a1029769b6f2 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); -EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(_copy_to_user); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 39369985f1cb..cf889d4e076a 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -65,7 +65,7 @@ .endm /* Standard copy_to_user with segment limit checking */ -ENTRY(copy_to_user) +ENTRY(_copy_to_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx @@ -75,7 +75,7 @@ ENTRY(copy_to_user) jae bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC -ENDPROC(copy_to_user) +ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) -- cgit v1.2.2 From e79c65a97c01d5da4317f44f9f98b3814e091a43 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 16 Nov 2009 18:14:26 +0300 Subject: x86: io-apic: IO-APIC MMIO should not fail on resource insertion If the IO-APIC base address is 1K-aligned, we should not fail the resource insertion procedure. For this sake we define an IO_APIC_SLOT_SIZE constant, which should cover all directly accessible IO-APIC registers.
An example of such a configuration is at http://marc.info/?l=linux-kernel&m=118114792006520 | | Quoting the message | | IOAPIC[0]: apic_id 2, version 32, address 0xfec00000, GSI 0-23 | IOAPIC[1]: apic_id 3, version 32, address 0xfec80000, GSI 24-47 | IOAPIC[2]: apic_id 4, version 32, address 0xfec80400, GSI 48-71 | IOAPIC[3]: apic_id 5, version 32, address 0xfec84000, GSI 72-95 | IOAPIC[4]: apic_id 8, version 32, address 0xfec84400, GSI 96-119 | Reported-by: "Maciej W. Rozycki" Signed-off-by: Cyrill Gorcunov Acked-by: Yinghai Lu LKML-Reference: <20091116151426.GC5653@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 6 ++++++ arch/x86/kernel/apic/io_apic.c | 11 +++++------ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 3b62da926de9..7fe3b3060f08 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -11,6 +11,12 @@ #define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 #define APIC_DEFAULT_PHYS_BASE 0xfee00000 +/* + * This is the IO-APIC register space as specified + * by Intel docs: + */ +#define IO_APIC_SLOT_SIZE 1024 + #define APIC_ID 0x20 #define APIC_LVR 0x30 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 20ea8392bc57..ff237199fa23 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4100,18 +4100,17 @@ void __init ioapic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + PAGE_SIZE-1; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } } -- cgit v1.2.2 From 8a50e5135af0c243e117e94e27feb8d149c879b4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 13 Nov 2009 15:28:13 -0800 Subject: x86-32: Use symbolic constants, safer CPUID when enabling EFER.NX Use symbolic constants rather than hard-coded values when setting EFER.NX in head_32.S, and do a more rigorous test for the validity of the response when probing for the extended CPUID range. Signed-off-by: H. Peter Anvin LKML-Reference: <1258154897-6770-2-git-send-email-hpa@zytor.com> Acked-by: Kees Cook --- arch/x86/kernel/head_32.S | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 050c278481b1..7fd318bac59c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include /* Physical address */ @@ -297,25 +299,27 @@ ENTRY(startup_32_smp) orl %edx,%eax movl %eax,%cr4 - btl $5, %eax # check if PAE is enabled - jnc 6f + testb $X86_CR4_PAE, %al # check if PAE is enabled + jz 6f /* Check if extended functions are implemented */ movl $0x80000000, %eax cpuid - cmpl $0x80000000, %eax - jbe 6f + /* Value must be in the range 0x80000001 to 0x8000ffff */ + subl $0x80000001, %eax + cmpl $(0x8000ffff-0x80000001), %eax + ja 6f mov $0x80000001, %eax cpuid /* Execute Disable bit supported? 
*/ - btl $20, %edx + btl $(X86_FEATURE_NX & 31), %edx jnc 6f /* Setup EFER (Extended Feature Enable Register) */ - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx rdmsr - btsl $11, %eax + btsl $_EFER_NX, %eax /* Make changes effective */ wrmsr -- cgit v1.2.2 From a7c4c0d934c6cbc58de262d090d4a715445453f0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 13 Nov 2009 15:28:14 -0800 Subject: x86, sleep: Always save the value of EFER Always save the value of EFER, regardless of the state of NX. Since EFER may not actually exist, use rdmsr_safe() to do so. v2: check the return value from rdmsr_safe() instead of relying on the output values being unchanged on error. Signed-off-by: H. Peter Anvin Acked-by: Rafael J. Wysocki Cc: Pavel Machek Cc: Nigel Cunningham LKML-Reference: <1258154897-6770-3-git-send-email-hpa@zytor.com> Acked-by: Kees Cook --- arch/x86/kernel/acpi/sleep.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 4a411450dfa0..82e508677b91 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -78,12 +78,9 @@ int acpi_save_state_mem(void) #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); - header->pmode_efer_low = nx_enabled; - if (header->pmode_efer_low & 1) { - /* This is strange, why not save efer, always? */ - rdmsr(MSR_EFER, header->pmode_efer_low, - header->pmode_efer_high); - } + if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low, + &header->pmode_efer_high)) + header->pmode_efer_low = header->pmode_efer_high = 0; #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); -- cgit v1.2.2 From 583140afb989f24d115e80be5c91e503b58ccfc0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 13 Nov 2009 15:28:15 -0800 Subject: x86, pageattr: Make set_memory_(x|nx) aware of NX support Make set_memory_x/set_memory_nx directly aware of whether NX is supported in the system, rather than requiring that every caller assesses that support independently. Signed-off-by: H. 
Peter Anvin Cc: Huang Ying Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Tejun Heo Cc: Tim Starling Cc: Hannes Eder LKML-Reference: <1258154897-6770-4-git-send-email-hpa@zytor.com> Acked-by: Kees Cook --- arch/x86/kernel/machine_kexec_32.c | 6 ++---- arch/x86/mm/pageattr.c | 6 ++++++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d00130..03657e784fd8 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -157,8 +157,7 @@ int machine_kexec_prepare(struct kimage *image) { int error; - if (nx_enabled) - set_pages_x(image->control_code_page, 1); + set_pages_x(image->control_code_page, 1); error = machine_kexec_alloc_page_tables(image); if (error) return error; @@ -172,8 +171,7 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { - if (nx_enabled) - set_pages_nx(image->control_code_page, 1); + set_pages_nx(image->control_code_page, 1); machine_kexec_free_page_tables(image); } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 09a140ca7be8..1d4eb93d333c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1085,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb); int set_memory_x(unsigned long addr, int numpages) { + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); -- cgit v1.2.2 From 4763ed4d45522b876c97e1f7f4b659d211f75571 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 13 Nov 2009 15:28:16 -0800 Subject: x86, mm: Clean up and simplify NX enablement The 32- and 64-bit code used very different mechanisms for enabling NX, but even the 32-bit code was enabling NX in head_32.S if it is available. Furthermore, we had a bewildering collection of tests for the availability of NX. This patch: a) merges the 32-bit set_nx() and the 64-bit check_efer() functions into a single x86_configure_nx() function. EFER control is left to the head code. b) eliminates the nx_enabled variable entirely. Things that need to test for NX enablement can verify __supported_pte_mask directly, and cpu_has_nx gives the supported status of NX. Signed-off-by: H. 
Peter Anvin Cc: Tejun Heo Cc: Brian Gerst Cc: Yinghai Lu Cc: Pekka Enberg Cc: Vegard Nossum Cc: Jeremy Fitzhardinge Cc: Chris Wright LKML-Reference: <1258154897-6770-5-git-send-email-hpa@zytor.com> Acked-by: Kees Cook --- arch/x86/include/asm/proto.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/setup.c | 8 ++------ arch/x86/mm/init.c | 4 ++-- arch/x86/mm/setup_nx.c | 43 ++++++------------------------------------- arch/x86/xen/enlighten.c | 4 +--- 6 files changed, 13 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 621f56d73121..add7f18f17a7 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -16,7 +16,7 @@ extern void ia32_sysenter_target(void); extern void syscall32_cpu_init(void); -extern void check_efer(void); +extern void x86_configure_nx(void); extern int reboot_force; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..18346da8c594 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1136,7 +1136,7 @@ void __cpuinit cpu_init(void) wrmsrl(MSR_KERNEL_GS_BASE, 0); barrier(); - check_efer(); + x86_configure_nx(); if (cpu != 0) enable_x2apic(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0a6e94ab8339..23b7f46bf843 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -787,21 +787,17 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; -#ifdef CONFIG_X86_64 /* * Must call this twice: Once just to detect whether hardware doesn't * support NX (so that the early EHCI debug console setup can safely * call set_fixmap(), and then again after parsing early parameters to * honor the respective command line option. 
*/ - check_efer(); -#endif + x86_configure_nx(); parse_early_param(); -#ifdef CONFIG_X86_64 - check_efer(); -#endif + x86_configure_nx(); /* Must be before kernel pagetables are setup */ vmi_activate(); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 73ffd5536f62..27ec2c23fd47 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -146,8 +146,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif - set_nx(); - if (nx_enabled) + /* XXX: replace this with Kees' improved messages */ + if (__supported_pte_mask & _PAGE_NX) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); /* Enable PSE if available */ diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 513d8ed5d2ec..355818b087b5 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -3,10 +3,8 @@ #include #include +#include -int nx_enabled; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static int disable_nx __cpuinitdata; /* @@ -22,48 +20,19 @@ static int __init noexec_setup(char *str) if (!str) return -EINVAL; if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; disable_nx = 0; } else if (!strncmp(str, "off", 3)) { disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; } + x86_configure_nx(); return 0; } early_param("noexec", noexec_setup); -#endif - -#ifdef CONFIG_X86_PAE -void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#else -void set_nx(void) -{ -} -#endif -#ifdef CONFIG_X86_64 -void __cpuinit check_efer(void) +void __cpuinit x86_configure_nx(void) { - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) + if (cpu_has_nx && !disable_nx) + __supported_pte_mask |= _PAGE_NX; + else __supported_pte_mask &= ~_PAGE_NX; } -#endif - diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3439616d69f1..c5e805d4a788 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1082,10 +1082,8 @@ asmlinkage void __init xen_start_kernel(void) __supported_pte_mask |= _PAGE_IOMAP; -#ifdef CONFIG_X86_64 /* Work out if we support NX */ - check_efer(); -#endif + x86_configure_nx(); xen_setup_features(); -- cgit v1.2.2 From 4b0f3b81eb33ef18283aa71440cccfede1753ae0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Nov 2009 15:28:17 -0800 Subject: x86, mm: Report state of NX protections during boot It is possible for x86_64 systems to lack the NX bit either due to the hardware lacking support or the BIOS having turned off the CPU capability, so NX status should be reported. Additionally, anyone booting NX-capable CPUs in 32bit mode without PAE will lack NX functionality, so this change provides feedback for that case as well. Signed-off-by: Kees Cook Signed-off-by: H. 
Peter Anvin LKML-Reference: <1258154897-6770-6-git-send-email-hpa@zytor.com> --- arch/x86/include/asm/proto.h | 1 + arch/x86/kernel/setup.c | 11 ++++++----- arch/x86/mm/init.c | 4 ---- arch/x86/mm/setup_nx.c | 22 ++++++++++++++++++++++ 4 files changed, 29 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index add7f18f17a7..450c56bcd4f8 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -17,6 +17,7 @@ extern void ia32_sysenter_target(void); extern void syscall32_cpu_init(void); extern void x86_configure_nx(void); +extern void x86_report_nx(void); extern int reboot_force; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 23b7f46bf843..d2043a00abc1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -788,16 +788,17 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = command_line; /* - * Must call this twice: Once just to detect whether hardware doesn't - * support NX (so that the early EHCI debug console setup can safely - * call set_fixmap(), and then again after parsing early parameters to - * honor the respective command line option. + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). It may then be called + * again from within noexec_setup() during parsing early parameters + * to honor the respective command line option. */ x86_configure_nx(); parse_early_param(); - x86_configure_nx(); + x86_report_nx(); /* Must be before kernel pagetables are setup */ vmi_activate(); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 27ec2c23fd47..d406c5239019 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif - /* XXX: replace this with Kees' improved messages */ - if (__supported_pte_mask & _PAGE_NX) - printk(KERN_INFO "NX (Execute Disable) protection: active\n"); - /* Enable PSE if available */ if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 355818b087b5..a3250aa34086 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -36,3 +36,25 @@ void __cpuinit x86_configure_nx(void) else __supported_pte_mask &= ~_PAGE_NX; } + +void __init x86_report_nx(void) +{ + if (!cpu_has_nx) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU or disabled in BIOS!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + if (disable_nx) { + printk(KERN_INFO "NX (Execute Disable) protection: " + "disabled by kernel command line option\n"); + } else { + printk(KERN_INFO "NX (Execute Disable) protection: " + "active\n"); + } +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} -- cgit v1.2.2 From 5bd085b5fbd8b0b8685a2173cb9263798fc2a44e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 16 Nov 2009 13:55:31 -0800 Subject: x86: remove "extern" from function prototypes in Function prototypes don't need "extern", and it is generally frowned upon to have them. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/proto.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 450c56bcd4f8..4009f6534f52 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -5,19 +5,19 @@ /* misc architecture specific prototypes */ -extern void early_idt_handler(void); +void early_idt_handler(void); -extern void system_call(void); -extern void syscall_init(void); +void system_call(void); +void syscall_init(void); -extern void ia32_syscall(void); -extern void ia32_cstar_target(void); -extern void ia32_sysenter_target(void); +void ia32_syscall(void); +void ia32_cstar_target(void); +void ia32_sysenter_target(void); -extern void syscall32_cpu_init(void); +void syscall32_cpu_init(void); -extern void x86_configure_nx(void); -extern void x86_report_nx(void); +void x86_configure_nx(void); +void x86_report_nx(void); extern int reboot_force; -- cgit v1.2.2 From d65ff75fbe6f8ac7c17f18e4108521898468822c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 16 Nov 2009 18:06:18 -0500 Subject: x86: Add verbose option to insn decoder test Add verbose option to insn decoder test. This dumps decoded instruction when building kernel with V=1. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Stephen Rothwell Cc: Randy Dunlap Cc: Jim Keniston Cc: Stephen Rothwell LKML-Reference: <20091116230618.5250.18762.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/tools/Makefile | 9 +++++- arch/x86/tools/test_get_len.c | 74 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 71 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 5e295d95dc25..4688f90ce5a2 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -1,6 +1,13 @@ PHONY += posttest + +ifeq ($(KBUILD_VERBOSE),1) + postest_verbose = -v +else + postest_verbose = +endif + quiet_cmd_posttest = TEST $@ - cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len $(CONFIG_64BIT) + cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len -$(CONFIG_64BIT) $(posttest_verbose) posttest: $(obj)/test_get_len vmlinux $(call cmd,posttest) diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 376d33852191..5743e5128d35 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -20,6 +20,7 @@ #include #include #include +#include #define unlikely(cond) (cond) @@ -36,11 +37,16 @@ */ const char *prog; +static int verbose; +static int x86_64; static void usage(void) { fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" - " %s [y|n](64bit flag)\n", prog); + " %s [-y|-n] [-v] \n", prog); + fprintf(stderr, "\t-y 64bit mode\n"); + fprintf(stderr, "\t-n 32bit mode\n"); + fprintf(stderr, "\t-v verbose mode\n"); exit(1); } @@ -50,6 +56,56 @@ static void malformed_line(const char *line, int line_nr) exit(3); } +static void dump_field(FILE *fp, const char *name, const char *indent, + struct insn_field *field) +{ + fprintf(fp, "%s.%s = {\n", indent, name); + fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n", + indent, field->value, field->bytes[0], field->bytes[1], + field->bytes[2], field->bytes[3]); + fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent, + field->got, field->nbytes); +} + +static void dump_insn(FILE 
*fp, struct insn *insn) +{ + fprintf(fp, "Instruction = { \n"); + dump_field(fp, "prefixes", "\t", &insn->prefixes); + dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); + dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); + dump_field(fp, "opcode", "\t", &insn->opcode); + dump_field(fp, "modrm", "\t", &insn->modrm); + dump_field(fp, "sib", "\t", &insn->sib); + dump_field(fp, "displacement", "\t", &insn->displacement); + dump_field(fp, "immediate1", "\t", &insn->immediate1); + dump_field(fp, "immediate2", "\t", &insn->immediate2); + fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n", + insn->attr, insn->opnd_bytes, insn->addr_bytes); + fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n", + insn->length, insn->x86_64, insn->kaddr); +} + +static void parse_args(int argc, char **argv) +{ + int c; + prog = argv[0]; + while ((c = getopt(argc, argv, "ynv")) != -1) { + switch (c) { + case 'y': + x86_64 = 1; + break; + case 'n': + x86_64 = 0; + break; + case 'v': + verbose = 1; + break; + default: + usage(); + } + } +} + #define BUFSIZE 256 int main(int argc, char **argv) @@ -57,15 +113,9 @@ int main(int argc, char **argv) char line[BUFSIZE]; unsigned char insn_buf[16]; struct insn insn; - int insns = 0; - int x86_64 = 0; - - prog = argv[0]; - if (argc > 2) - usage(); + int insns = 0, c; - if (argc == 2 && argv[1][0] == 'y') - x86_64 = 1; + parse_args(argc, argv); while (fgets(line, BUFSIZE, stdin)) { char copy[BUFSIZE], *s, *tab1, *tab2; @@ -97,8 +147,10 @@ int main(int argc, char **argv) if (insn.length != nb) { fprintf(stderr, "Error: %s", line); fprintf(stderr, "Error: objdump says %d bytes, but " - "insn_get_length() says %d (attr:%x)\n", nb, - insn.length, insn.attr); + "insn_get_length() says %d\n", nb, + insn.length); + if (verbose) + dump_insn(stderr, &insn); exit(2); } } -- cgit v1.2.2 From 35039eb6b199749943547c8572be6604edf00229 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 16 Nov 2009 18:06:24 -0500 Subject: x86: Show symbol name if insn decoder test failed Show the symbol name if the insn decoder test finds a difference. This will help us find out where the issue is. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Stephen Rothwell Cc: Randy Dunlap Cc: Jim Keniston Cc: Stephen Rothwell LKML-Reference: <20091116230624.5250.49813.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/tools/distill.awk | 5 +++++ arch/x86/tools/test_get_len.c | 10 +++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk index d433619bb866..c13c0ee48ab4 100644 --- a/arch/x86/tools/distill.awk +++ b/arch/x86/tools/distill.awk @@ -15,6 +15,11 @@ BEGIN { fwait_str="9b\tfwait" } +/^ *[0-9a-f]+ <[^>]*>:/ { + # Symbol entry + printf("%s%s\n", $2, $1) +} + /^ *[0-9a-f]+:/ { if (split($0, field, "\t") < 3) { # This is a continuation of the same insn. 
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 5743e5128d35..af75e07217ba 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -110,7 +110,7 @@ static void parse_args(int argc, char **argv) int main(int argc, char **argv) { - char line[BUFSIZE]; + char line[BUFSIZE], sym[BUFSIZE] = ""; unsigned char insn_buf[16]; struct insn insn; int insns = 0, c; @@ -122,6 +122,12 @@ int main(int argc, char **argv) int nb = 0; unsigned int b; + if (line[0] == '<') { + /* Symbol line */ + strcpy(sym, line); + continue; + } + insns++; memset(insn_buf, 0, 16); strcpy(copy, line); @@ -145,6 +151,8 @@ int main(int argc, char **argv) insn_init(&insn, insn_buf, x86_64); insn_get_length(&insn); if (insn.length != nb) { + fprintf(stderr, "Error: %s found a difference at %s\n", + prog, sym); fprintf(stderr, "Error: %s", line); fprintf(stderr, "Error: objdump says %d bytes, but " "insn_get_length() says %d\n", nb, -- cgit v1.2.2 From 42109197eb7c01080eea6d9cd48ca23cbc3c566c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sun, 15 Nov 2009 21:19:52 +0900 Subject: x86: gart: Add own dma_mapping_error function The GART IOMMU is the only user of the bad_dma_address variable. This patch converts GART to use the newer mechanism, filling in ->mapping_error() in struct dma_map_ops, to make dma_mapping_error() work in an IOMMU-specific way. Signed-off-by: FUJITA Tomonori Acked-by: Jesse Barnes Cc: muli@il.ibm.com Cc: joerg.roedel@amd.com LKML-Reference: <1258287594-8777-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 919182e15d1e..61c4d1e41a6b 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -47,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */ static u32 *iommu_gatt_base; /* Remapping table */ +static dma_addr_t bad_dma_addr; + /* * If this is disabled the IOMMU will use an optimized flushing strategy * of only flushing when an mapping is reused. 
With it true the GART is @@ -217,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); - return bad_dma_address; + return bad_dma_addr; } for (i = 0; i < npages; i++) { @@ -303,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_address) { + if (addr == bad_dma_addr) { if (i > 0) gart_unmap_sg(dev, sg, i, dir, NULL); nents = 0; @@ -456,7 +458,7 @@ error: iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) - s->dma_address = bad_dma_address; + s->dma_address = bad_dma_addr; return 0; } @@ -480,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, DMA_BIDIRECTIONAL, align_mask); flush_gart(); - if (paddr != bad_dma_address) { + if (paddr != bad_dma_addr) { *dma_addr = paddr; return page_address(page); } @@ -500,6 +502,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -687,6 +694,7 @@ static struct dma_map_ops gart_dma_ops = { .unmap_page = gart_unmap_page, .alloc_coherent = gart_alloc_coherent, .free_coherent = gart_free_coherent, + .mapping_error = gart_mapping_error, }; static void gart_iommu_shutdown(void) @@ -785,7 +793,7 @@ int __init gart_iommu_init(void) iommu_start = aper_size - iommu_size; iommu_bus_base = info.aper_base + iommu_start; - bad_dma_address = iommu_bus_base; + bad_dma_addr = iommu_bus_base; iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* -- cgit v1.2.2 From 8fd524b355daef0945692227e726fb444cebcd4f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sun, 15 Nov 2009 21:19:53 +0900 Subject: x86: Kill bad_dma_address variable This kills the bad_dma_address variable, the old mechanism to enable IOMMU drivers to make dma_mapping_error() work in an IOMMU-specific way. It can't handle systems that use both swiotlb and a HW IOMMU, so we introduced dma_map_ops->mapping_error to solve that case. Intel VT-d, GART, and swiotlb already use dma_map_ops->mapping_error. Calgary, AMD IOMMU, and nommu use zero for an error dma address. This adds DMA_ERROR_CODE and converts them to use it (as SPARC and POWER do).
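With the conversion done, the generic error check reduces to the sketch below (a simplified rendering of the dma-mapping.h hunk in this patch):

	static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
	{
		struct dma_map_ops *ops = get_dma_ops(dev);

		if (ops->mapping_error)
			return ops->mapping_error(dev, dma_addr); /* IOMMU-specific */

		return dma_addr == DMA_ERROR_CODE; /* common zero error code */
	}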
Signed-off-by: FUJITA Tomonori Acked-by: Jesse Barnes Cc: muli@il.ibm.com Cc: joerg.roedel@amd.com LKML-Reference: <1258287594-8777-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dma-mapping.h | 5 +++-- arch/x86/kernel/amd_iommu.c | 21 ++++++++++----------- arch/x86/kernel/pci-calgary_64.c | 22 ++++++++++------------ arch/x86/kernel/pci-dma.c | 3 --- arch/x86/kernel/pci-nommu.c | 2 +- 5 files changed, 24 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 6a25d5d42836..0f6c02f3b7d4 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -20,7 +20,8 @@ # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) #endif -extern dma_addr_t bad_dma_address; +#define DMA_ERROR_CODE 0 + extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; @@ -48,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); - return (dma_addr == bad_dma_address); + return (dma_addr == DMA_ERROR_CODE); } #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 66237fde758f..093bd526c949 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -928,7 +928,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, } if (unlikely(address == -1)) - address = bad_dma_address; + address = DMA_ERROR_CODE; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -1544,7 +1544,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, pte = dma_ops_get_pte(dom, address); if (!pte) - return bad_dma_address; + return DMA_ERROR_CODE; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1625,7 +1625,7 @@ static dma_addr_t __map_single(struct device *dev, retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) { + if (unlikely(address == DMA_ERROR_CODE)) { /* * setting next_address here will let the address * allocator only scan the new allocated range in the @@ -1646,7 +1646,7 @@ retry: start = address; for (i = 0; i < pages; ++i) { ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); - if (ret == bad_dma_address) + if (ret == DMA_ERROR_CODE) goto out_unmap; paddr += PAGE_SIZE; @@ -1674,7 +1674,7 @@ out_unmap: dma_ops_free_addresses(dma_dom, address, pages); - return bad_dma_address; + return DMA_ERROR_CODE; } /* @@ -1690,7 +1690,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_addr_t i, start; unsigned int pages; - if ((dma_addr == bad_dma_address) || + if ((dma_addr == DMA_ERROR_CODE) || (dma_addr + size > dma_dom->aperture_size)) return; @@ -1732,7 +1732,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, INC_STATS_COUNTER(cnt_map_single); if (!check_device(dev)) - return bad_dma_address; + return DMA_ERROR_CODE; dma_mask = *dev->dma_mask; @@ -1743,12 +1743,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page, return (dma_addr_t)paddr; if (!dma_ops_domain(domain)) - return bad_dma_address; + return DMA_ERROR_CODE; spin_lock_irqsave(&domain->lock, flags); addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, dma_mask); - if (addr == bad_dma_address) + if (addr == DMA_ERROR_CODE) goto out; iommu_completion_wait(iommu); @@ -1957,7 +1957,7 @@ static void 
*alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { + if (*dma_addr == DMA_ERROR_CODE) { spin_unlock_irqrestore(&domain->lock, flags); goto out_free; } @@ -2110,7 +2110,6 @@ int __init amd_iommu_init_dma_ops(void) prealloc_protection_domains(); iommu_detected = 1; - bad_dma_address = 0; swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index c84ad037f586..af9f436096a2 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -245,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return bad_dma_address; + return DMA_ERROR_CODE; } } @@ -261,11 +261,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { unsigned long entry; - dma_addr_t ret = bad_dma_address; + dma_addr_t ret = DMA_ERROR_CODE; entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == bad_dma_address)) + if (unlikely(entry == DMA_ERROR_CODE)) goto error; /* set the return dma address */ @@ -280,7 +280,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, error: printk(KERN_WARNING "Calgary: failed to allocate %u pages in " "iommu %p\n", npages, tbl); - return bad_dma_address; + return DMA_ERROR_CODE; } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, @@ -291,8 +291,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; @@ -374,7 +374,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == bad_dma_address) { + if (entry == DMA_ERROR_CODE) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -392,7 +392,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, NULL); for_each_sg(sg, s, nelems, i) { - sg->dma_address = bad_dma_address; + sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; } return 0; @@ -447,7 +447,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) + if (mapping == DMA_ERROR_CODE) goto free; *dma_handle = mapping; return ret; @@ -728,7 +728,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); + iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ @@ -1359,8 +1359,6 @@ static int __init calgary_iommu_init(void) return ret; } - 
bad_dma_address = 0x0; - return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index bf621b9ee26e..afcc58b69c7c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -43,9 +43,6 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; -dma_addr_t bad_dma_address __read_mostly = 0; -EXPORT_SYMBOL(bad_dma_address); - /* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 875e3822ae61..22be12b60a8f 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return bad_dma_address; + return DMA_ERROR_CODE; flush_write_buffers(); return bus; } -- cgit v1.2.2 From 1f7564ca831a00b21bb493ef174c845b2ba9e64d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sun, 15 Nov 2009 21:19:54 +0900 Subject: x86: Calgary: Remove unnecessary DMA_ERROR_CODE usage This cleans up iommu_alloc() a bit and removes unnecessary DMA_ERROR_CODE usage. Signed-off-by: FUJITA Tomonori Acked-by: Jesse Barnes Cc: muli@il.ibm.com Cc: joerg.roedel@amd.com LKML-Reference: <1258287594-8777-4-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index af9f436096a2..849a0995d970 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -261,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { unsigned long entry; - dma_addr_t ret = DMA_ERROR_CODE; + dma_addr_t ret; entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_ERROR_CODE)) - goto error; + if (unlikely(entry == DMA_ERROR_CODE)) { + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return DMA_ERROR_CODE; + } /* set the return dma address */ ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); @@ -274,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* put the TCEs in the HW table */ tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, direction); - return ret; - -error: - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); - return DMA_ERROR_CODE; } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, -- cgit v1.2.2 From 123bf0e2eddcda36a33bdfc87aa1fb07229f07b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Nov 2009 21:19:52 +0900 Subject: x86: gart: Clean up the code a bit Clean up various small stylistic details in the GART code. No functionality changed. 
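Much of the cleanup converts open-coded printk(KERN_*) calls to the pr_*() helpers, which expand to roughly the following (an approximation that ignores the pr_fmt() prefix machinery):

	/* approximate definitions, for reference */
	#define pr_info(fmt, ...)	printk(KERN_INFO fmt, ##__VA_ARGS__)
	#define pr_warning(fmt, ...)	printk(KERN_WARNING fmt, ##__VA_ARGS__)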
Cc: FUJITA Tomonori Cc: Jesse Barnes Cc: muli@il.ibm.com Cc: joerg.roedel@amd.com LKML-Reference: <1258287594-8777-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 116 ++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 61c4d1e41a6b..e6a0d402f171 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -95,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); @@ -297,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, int i; #ifdef CONFIG_IOMMU_DEBUG - printk(KERN_DEBUG "dma_map_sg overflow\n"); + pr_debug("dma_map_sg overflow\n"); #endif for_each_sg(sg, s, nents, i) { @@ -392,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!dev) dev = &x86_dma_fallback_dev; - out = 0; - start = 0; - start_sg = sgmap = sg; - seg_size = 0; - max_seg_size = dma_get_max_seg_size(dev); - ps = NULL; /* shut up gcc */ + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); @@ -420,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, sgmap, pages, need) < 0) goto error; out++; - seg_size = 0; - sgmap = sg_next(sgmap); - pages = 0; - start = i; - start_sg = s; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; } } @@ -523,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - printk(KERN_WARNING + pr_warning( "PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", iommu_size >> 20); @@ -578,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static int gart_resume(struct sys_device *dev) +static void gart_fixup_northbridges(struct sys_device *dev) { - printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); + int i; - if (fix_up_north_bridges) { - int i; + if (!fix_up_north_bridges) + return; - printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < num_k8_northbridges; i++) { + struct pci_dev *dev = k8_northbridges[i]; - /* - * Don't enable translations just yet. That is the next - * step. Restore the pre-suspend aperture settings. - */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, - aperture_order << 1); - pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, - aperture_alloc >> 25); - } + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. 
+ */ + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } +} + +static int gart_resume(struct sys_device *dev) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(dev); enable_gart_translations(); @@ -612,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, - .resume = gart_resume, + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, }; static struct sys_device device_gart = { - .id = 0, - .cls = &gart_sysdev_class, + .cls = &gart_sysdev_class, }; /* @@ -635,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; int i, error; - printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + pr_info("PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { @@ -653,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) } if (!aper_base) goto nommu; + info->aper_base = aper_base; info->aper_size = aper_size >> 20; @@ -675,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) flush_gart(); - printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); return 0; nommu: /* Should not happen anymore */ - printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" "falling back to iommu=soft.\n"); return -1; } @@ -744,23 +752,23 @@ int __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n"); - printk(KERN_WARNING "falling back to iommu=soft.\n"); + pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warning("falling back to iommu=soft.\n"); } return 0; } /* need to map that range */ - aper_size = info.aper_size << 20; - aper_base = info.aper_base; - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { start_pfn = (aper_base>>PAGE_SHIFT); init_memory_mapping(start_pfn<> PAGE_SHIFT; @@ -775,8 +783,7 @@ int __init gart_iommu_init(void) ret = dma_debug_resize_entries(iommu_pages); if (ret) - printk(KERN_DEBUG - "PCI-DMA: Cannot trace all the entries\n"); + pr_debug("PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -786,15 +793,14 @@ int __init gart_iommu_init(void) */ iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; - printk(KERN_INFO - "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_addr = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* * Unmap the IOMMU part of the GART. 
The alias of the page is @@ -816,7 +822,7 @@ int __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); - + /* * Now all caches are flushed and we can safely enable * GART hardware. Doing it early leaves the possibility -- cgit v1.2.2 From 8cc2361bd00e87aab2827a3996a71fe9b2c9f9c4 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 17 Nov 2009 08:06:38 +0100 Subject: x86: ucode-amd: Move family check to microcode_amd.c's init function ... to avoid uselessly trying to load firmware on systems with unsupported AMD CPUs. Signed-off-by: Andreas Herrmann Cc: Dmitry Adamushko Cc: Mike Travis Cc: Tigran Aivazian Cc: Borislav Petkov Cc: Andreas Mohr Cc: Jack Steiner LKML-Reference: <20091117070638.GA27691@alberich.amd.com> [ v2: changed BUG_ON() to WARN_ON() ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 26e33bd8485b..63123d902103 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -34,6 +34,7 @@ MODULE_LICENSE("GPL v2"); #define UCODE_UCODE_TYPE 0x00000001 const struct firmware *firmware; +static int supported_cpu; struct equiv_cpu_entry { u32 installed_cpu; @@ -73,15 +74,12 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + if (!supported_cpu) return -1; - } + + memset(csig, 0, sizeof(*csig)); rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; @@ -331,6 +329,17 @@ static void microcode_fini_cpu_amd(int cpu) void init_microcode_amd(struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &boot_cpu_data; + + WARN_ON(c->x86_vendor != X86_VENDOR_AMD); + + if (c->x86 < 0x10) { + pr_warning("microcode: AMD CPU family 0x%x not supported\n", + c->x86); + return; + } + supported_cpu = 1; + if (request_firmware(&firmware, fw_name, device)) pr_err("microcode: failed to load file %s\n", fw_name); } -- cgit v1.2.2 From 0696b711e4be45fa104c12329f617beb29c03f78 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Tue, 17 Nov 2009 13:49:50 +0800 Subject: timekeeping: Fix clock_gettime vsyscall time warp Since commit 0a544198 "timekeeping: Move NTP adjusted clock multiplier to struct timekeeper" the clock multiplier of vsyscall is updated with the unmodified clock multiplier of the clock source and not with the NTP adjusted multiplier of the timekeeper. This causes user-space observable time warps: new CLOCK-warp maximum: 120 nsecs, 00000025c337c537 -> 00000025c337c4bf Add a new argument "mult" to update_vsyscall() and hand in the timekeeping internal NTP adjusted multiplier. 
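The multiplier matters because the vsyscall read path converts raw clocksource cycles to nanoseconds essentially as follows (a simplified sketch of the gettime fast path, not the literal source):

	cycle_t now = gtod->clock.vread();	/* e.g. TSC or HPET read */
	ns = ((now - gtod->clock.cycle_last) & gtod->clock.mask)
			* gtod->clock.mult >> gtod->clock.shift;

With the raw clocksource mult stored here, user-space time and the NTP-adjusted kernel time drift apart between updates, which is exactly the observed warp.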
Signed-off-by: Lin Ming Cc: "Zhang Yanmin" Cc: Martin Schwidefsky Cc: Benjamin Herrenschmidt Cc: Tony Luck LKML-Reference: <1258436990.17765.83.camel@minggr.sh.intel.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8cb4974ff599..62f39d79b775 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,7 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, + u32 mult) { unsigned long flags; @@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; -- cgit v1.2.2 From 508d85c2c6bc8cba53d2a54d9a306ad64a0a80bf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Nov 2009 23:04:56 -0800 Subject: x86: When cleaning MTRRs, do not fold WP into UC The current MTRR code treats WP as a form of UC. This really isn't desirable behaviour, except possibly in the case of severe MTRR shortage. Disable this, to allow legitimate uses of WP to remain unmolested. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin Cc: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 315738c74aad..6e49f6f91f31 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -689,8 +689,6 @@ static int __init mtrr_need_cleanup(void) continue; if (!size) type = MTRR_NUM_TYPES; - if (type == MTRR_TYPE_WRPROT) - type = MTRR_TYPE_UNCACHABLE; num[type]++; } -- cgit v1.2.2 From 6dbfe5a57db3564adf7b2a65068e40f1b4a0d2db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 18:27:18 +0100 Subject: x86: Fixup last users of irq_chip->typename The typename member of struct irq_chip was kept for migration purposes and has been obsolete for more than two years. Fix up the leftovers. 
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/visws_quirks.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index f068553a1b17..f084dfd97fcb 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq) } static struct irq_chip cobalt_irq_type = { - .typename = "Cobalt-APIC", + .name = "Cobalt-APIC", .startup = startup_cobalt_irq, .shutdown = disable_cobalt_irq, .enable = enable_cobalt_irq, @@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq) } static struct irq_chip piix4_master_irq_type = { - .typename = "PIIX4-master", + .name = "PIIX4-master", .startup = startup_piix4_master_irq, .ack = ack_cobalt_irq, .end = end_piix4_master_irq, @@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { - .typename = "PIIX4-virtual", + .name = "PIIX4-virtual", .shutdown = disable_8259A_irq, .enable = enable_8259A_irq, .disable = disable_8259A_irq, -- cgit v1.2.2 From 070e5c3f9989a72076e83fdd5ede3f0f3eb17264 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2009 12:27:47 +0100 Subject: x86: vmiclock: Fix printk format clockevents.mult became u32. Fix the printk format. Pointed-out-by: Randy Dunlap Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vmiclock_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 611b9e2360d3..74c92bb194df 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void) evt->min_delta_ns = clockevent_delta2ns(1, evt); evt->cpumask = cpumask_of(cpu); - printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", evt->name, evt->mult, evt->shift); clockevents_register_device(evt); } -- cgit v1.2.2 From 350f8f5631922c7848ec4b530c111cb8c2ff7caa Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 13 Nov 2009 11:54:40 +0000 Subject: x86: Eliminate redundant/contradicting cache line size config options Rather than having X86_L1_CACHE_BYTES and X86_L1_CACHE_SHIFT (with inconsistent defaults), just having the latter suffices as the former can be easily calculated from it. To be consistent, also change X86_INTERNODE_CACHE_BYTES to X86_INTERNODE_CACHE_SHIFT, and set it to 7 (128 bytes) for NUMA to account for last level cache line size (which here matters more than L1 cache line size). Finally, make sure the default value for X86_L1_CACHE_SHIFT, when X86_GENERIC is selected, is seen before those for the individual CPU model options (other than on x86-64, where GENERIC_CPU is part of the choice construct, X86_GENERIC is a separate option on ix86). 
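The byte value is trivially derived from the shift, which is the point of keeping only the latter (mirroring the cache.h hunk below):

	/* shift -> bytes: one definition instead of two config options */
	#define INTERNODE_CACHE_SHIFT	CONFIG_X86_INTERNODE_CACHE_SHIFT
	#define INTERNODE_CACHE_BYTES	(1 << INTERNODE_CACHE_SHIFT)

e.g. a shift of 7 yields the 128-byte NUMA internode alignment and a shift of 12 the 4096-byte vSMP value.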
Signed-off-by: Jan Beulich Acked-by: Ravikiran Thirumalai Acked-by: Nick Piggin LKML-Reference: <4AFD5710020000780001F8F0@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 14 +++++--------- arch/x86/boot/compressed/vmlinux.lds.S | 3 ++- arch/x86/include/asm/cache.h | 7 ++++--- arch/x86/kernel/vmlinux.lds.S | 10 +++++----- arch/x86/mm/tlb.c | 3 ++- 5 files changed, 18 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index f2824fb8c79c..621f2bd0ef56 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -301,15 +301,11 @@ config X86_CPU # # Define implied options from the CPU selection here -config X86_L1_CACHE_BYTES +config X86_INTERNODE_CACHE_SHIFT int - default "128" if MPSC - default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 - -config X86_INTERNODE_CACHE_BYTES - int - default "4096" if X86_VSMP - default X86_L1_CACHE_BYTES if !X86_VSMP + default "12" if X86_VSMP + default "7" if NUMA + default X86_L1_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) @@ -317,9 +313,9 @@ config X86_CMPXCHG config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU config X86_XADD def_bool y diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index f4193bb48782..a6f1a59a5b0c 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) #undef i386 +#include #include #ifdef CONFIG_X86_64 @@ -46,7 +47,7 @@ SECTIONS *(.data.*) _edata = . ; } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .bss : { _bss = . ; *(.bss) diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index 549860d3be8f..2f9047cfaaca 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h @@ -9,12 +9,13 @@ #define __read_mostly __attribute__((__section__(".data.read_mostly"))) +#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT +#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT) + #ifdef CONFIG_X86_VSMP -/* vSMP Internode cacheline shift */ -#define INTERNODE_CACHE_SHIFT (12) #ifdef CONFIG_SMP #define __cacheline_aligned_in_smp \ - __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ + __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \ __page_aligned_data #endif #endif diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index fd2dabec1dff..eeb4f5fbd86f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -135,13 +135,13 @@ SECTIONS PAGE_ALIGNED_DATA(PAGE_SIZE) - CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) DATA_DATA CONSTRUCTORS /* rarely changed data like cpu maps */ - READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) + READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) /* End of data section */ _edata = .; @@ -165,12 +165,12 @@ SECTIONS *(.vsyscall_0) } :user - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . 
= ALIGN(L1_CACHE_BYTES); .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } @@ -194,7 +194,7 @@ SECTIONS } vgetcpu_mode = VVIRT(.vgetcpu_mode); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 36fe08eeb5c3..65b58e4b0b8b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -43,7 +44,7 @@ union smp_flush_state { spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; - char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; + char pad[INTERNODE_CACHE_BYTES]; } ____cacheline_internodealigned_in_smp; /* State is put into the per CPU data section, but padded -- cgit v1.2.2 From ce64c62074d945fe5f8a7f01bdc30125f994ea67 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 16 Nov 2009 18:06:31 -0500 Subject: x86: Instruction decoder test should generate build warning Since some instructions are not decoded correctly by older versions of objdump, it may cause false positive error in insn decoder posttest. This changes build error of insn decoder test to build warning. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Stephen Rothwell Cc: Randy Dunlap Cc: Jim Keniston Cc: Stephen Rothwell LKML-Reference: <20091116230631.5250.41579.stgit@harusame> Signed-off-by: Ingo Molnar --- arch/x86/tools/test_get_len.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index af75e07217ba..d8214dc03fa7 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -114,6 +114,7 @@ int main(int argc, char **argv) unsigned char insn_buf[16]; struct insn insn; int insns = 0, c; + int warnings = 0; parse_args(argc, argv); @@ -151,18 +152,22 @@ int main(int argc, char **argv) insn_init(&insn, insn_buf, x86_64); insn_get_length(&insn); if (insn.length != nb) { - fprintf(stderr, "Error: %s found a difference at %s\n", + warnings++; + fprintf(stderr, "Warning: %s found difference at %s\n", prog, sym); - fprintf(stderr, "Error: %s", line); - fprintf(stderr, "Error: objdump says %d bytes, but " + fprintf(stderr, "Warning: %s", line); + fprintf(stderr, "Warning: objdump says %d bytes, but " "insn_get_length() says %d\n", nb, insn.length); if (verbose) dump_insn(stderr, &insn); - exit(2); } } - fprintf(stderr, "Succeed: decoded and checked %d instructions\n", - insns); + if (warnings) + fprintf(stderr, "Warning: decoded and checked %d" + " instructions with %d warnings\n", insns, warnings); + else + fprintf(stderr, "Succeed: decoded and checked %d" + " instructions\n", insns); return 0; } -- cgit v1.2.2 From 746357d6a526d6da9d89a2ec645b28406e959c2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 Nov 2009 12:01:43 +0100 Subject: x86: Prevent GCC 4.4.x (pentium-mmx et al) function prologue wreckage When the kernel is compiled with -pg for tracing GCC 4.4.x inserts stack alignment of a function _before_ the mcount prologue if the -march=pentium-mmx is set and -mtune=generic is not set. 
This breaks the assumption of the function graph tracer which expects that the mcount prologue push %ebp mov %esp, %ebp is the first stack operation in a function, because it needs to modify the function return address on the stack to trap into the tracer before returning to the real caller. The generated code is: push %edi lea 0x8(%esp),%edi and $0xfffffff0,%esp pushl -0x4(%edi) push %ebp mov %esp,%ebp so the tracer modifies the copy of the return address which is stored after the stack alignment and therefore does not trap the return, which in turn breaks the call chain logic of the tracer and leads to a kernel panic. Aside from the fact that the generated code is horrible for no good reason, other -march/-mtune options generate the expected: push %ebp mov %esp,%ebp and $0xfffffff0,%esp which does the same and keeps everything intact. After some experimenting we found out that this problem is restricted to gcc 4.4.x and to the following -march settings: i586, pentium, pentium-mmx, k6, k6-2, k6-3, winchip-c6, winchip2, c3, geode By adding -mtune=generic the code generator always produces the expected code. So forcing -mtune=generic when CONFIG_FUNCTION_GRAPH_TRACER=y is not pretty, but at the moment it is the only way to prevent the kernel from tripping over gcc-shrooms induced code madness. Most distro kernels have CONFIG_X86_GENERIC=y anyway, which forces -mtune=generic as well, so it will not impact those. References: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42109 http://lkml.org/lkml/2009/11/19/17 Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Linus Torvalds Cc: Andrew Morton Cc: Ingo Molnar Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jeff Law Cc: gcc@gcc.gnu.org Cc: David Daney Cc: Andrew Haley Cc: Richard Guenther Cc: stable@kernel.org --- arch/x86/Makefile_32.cpu | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 30e9a264f69d..df7fdf811997 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -46,6 +46,12 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) +# Work around the pentium-mmx code generator madness of gcc4.4.x which +# does stack alignment by generating horrible code _before_ the mcount +# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph +# tracer assumptions +cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-mtune=generic) + # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. ifneq ($(CONFIG_X86_P6_NOP),y) -- cgit v1.2.2 From 80509e27e40d7554e576405ed9f5b7966c567112 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 20 Nov 2009 12:13:08 -0500 Subject: x86: Fix insn decoder test typos Fix postest_verbose to posttest_verbose, and add a posttest_64bit option for CONFIG_64BIT != y, since the old command just passed '-' instead of '-n' when CONFIG_64BIT is not set. Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Stephen Rothwell Cc: Randy Dunlap Cc: Jim Keniston LKML-Reference: <20091120171307.6715.66099.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: H.
Peter Anvin --- arch/x86/tools/Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 4688f90ce5a2..c80b0792cd83 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -1,13 +1,19 @@ PHONY += posttest ifeq ($(KBUILD_VERBOSE),1) - postest_verbose = -v + posttest_verbose = -v else - postest_verbose = + posttest_verbose = +endif + +ifeq ($(CONFIG_64BIT),y) + posttest_64bit = -y +else + posttest_64bit = -n endif quiet_cmd_posttest = TEST $@ - cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len -$(CONFIG_64BIT) $(posttest_verbose) + cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) posttest: $(obj)/test_get_len vmlinux $(call cmd,posttest) -- cgit v1.2.2 From 6f5f67267dc4faecd9cba63894de92ca92a608b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 20 Nov 2009 12:13:14 -0500 Subject: x86: insn decoder test checks objdump version Check objdump version before using it for insn decoder build test, because some older objdump can't decode AVX code correctly. Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Stephen Rothwell Cc: Randy Dunlap Cc: Jim Keniston LKML-Reference: <20091120171314.6715.30390.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: H. Peter Anvin --- arch/x86/tools/Makefile | 5 ++++- arch/x86/tools/chkobjdump.awk | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 arch/x86/tools/chkobjdump.awk (limited to 'arch/x86') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index c80b0792cd83..f82082677337 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -12,8 +12,11 @@ else posttest_64bit = -n endif +distill_awk = $(srctree)/arch/x86/tools/distill.awk +chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk + quiet_cmd_posttest = TEST $@ - cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) + cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) posttest: $(obj)/test_get_len vmlinux $(call cmd,posttest) diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk new file mode 100644 index 000000000000..0d13cd9fdcff --- /dev/null +++ b/arch/x86/tools/chkobjdump.awk @@ -0,0 +1,23 @@ +# GNU objdump version checker +# +# Usage: +# objdump -v | awk -f chkobjdump.awk +BEGIN { + # objdump version 2.19 or later is OK for the test. + od_ver = 2; + od_sver = 19; +} + +/^GNU/ { + split($4, ver, "."); + if (ver[1] > od_ver || + (ver[1] == od_ver && ver[2] >= od_sver)) { + exit 1; + } else { + printf("Warning: objdump version %s is older than %d.%d\n", + $4, od_ver, od_sver); + print("Warning: Skipping posttest."); + # Logic is inverted, because we just skip test without error. + exit 0; + } +} -- cgit v1.2.2 From 44280733e71ad15377735b42d8538c109c94d7e3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 22 Nov 2009 17:18:49 -0800 Subject: x86: Change crash kernel to reserve via reserve_early() use find_e820_area()/reserve_early() instead. -v2: address Eric's request, to restore original semantics. 
will fail, if the provided address can not be used. Signed-off-by: Yinghai Lu Acked-by: Eric W. Biederman LKML-Reference: <4B09E2F9.7040403@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 57 +++++++++++++------------------------------------ 1 file changed, 15 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d2043a00abc1..e3eae5965e4a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -487,42 +487,11 @@ static void __init reserve_early_setup_data(void) #ifdef CONFIG_KEXEC -/** - * Reserve @size bytes of crashkernel memory at any suitable offset. - * - * @size: Size of the crashkernel memory to reserve. - * Returns the base address on success, and -1ULL on failure. - */ -static -unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) -{ - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long start = 0LL; - - while (1) { - int ret; - - start = find_e820_area(start, ULONG_MAX, size, alignment); - if (start == -1ULL) - return start; - - /* try to reserve it */ - ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE); - if (ret >= 0) - return start; - - start += alignment; - } -} - static inline unsigned long long get_total_mem(void) { unsigned long long total; - total = max_low_pfn - min_low_pfn; -#ifdef CONFIG_HIGHMEM - total += highend_pfn - highstart_pfn; -#endif + total = max_pfn - min_low_pfn; return total << PAGE_SHIFT; } @@ -542,21 +511,25 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { - crash_base = find_and_reserve_crashkernel(crash_size); + const unsigned long long alignment = 16<<20; /* 16M */ + + crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, + alignment); if (crash_base == -1ULL) { - pr_info("crashkernel reservation failed. " - "No suitable area found.\n"); + pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } } else { - ret = reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE); - if (ret < 0) { - pr_info("crashkernel reservation failed - " - "memory is in use\n"); + unsigned long long start; + + start = find_e820_area(crash_base, ULONG_MAX, crash_size, + 1<<20); + if (start != crash_base) { + pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } + reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -927,6 +900,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); + reserve_crashkernel(); + vsmp_init(); io_delay_init(); @@ -957,8 +932,6 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); - reserve_crashkernel(); - #ifdef CONFIG_X86_64 /* * dma32_reserve_bootmem() allocates bootmem which may conflict -- cgit v1.2.2 From e670761f12f4069d204f433bf547d9c679a4fd05 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86: apic: Remove not needed #ifdef Suresh made dmar_table_init() already have that protection. 
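Returning to the crashkernel rework above, the new flow is "find a fitting area, then reserve it early", for both the automatic and the caller-supplied base. A user-space sketch of that pattern (find_area() is a hypothetical stand-in for find_e820_area(); the real e820 walk is omitted):

	#include <stdio.h>

	#define NOT_FOUND	(~0ULL)

	/* hypothetical stand-in: first suitably aligned address at or
	 * above 'from' that can hold 'size' bytes, or NOT_FOUND */
	static unsigned long long find_area(unsigned long long from,
					    unsigned long long size,
					    unsigned long long align)
	{
		(void)size;	/* a real implementation checks the e820 map */
		return (from + align - 1) & ~(align - 1);
	}

	int main(void)
	{
		const unsigned long long align = 16ULL << 20;	/* 16M, as in the patch */
		unsigned long long size = 64ULL << 20;
		unsigned long long base = find_area(align, size, align);

		if (base == NOT_FOUND) {
			printf("crashkernel reservation failed - no suitable area found\n");
			return 1;
		}
		/* the kernel would now call reserve_early(base, base + size, ...) */
		printf("crashkernel: %lluM reserved at %lluM\n", size >> 20, base >> 20);
		return 0;
	}

For a user-specified base the same helper doubles as the in-use check: if find_area(crash_base, ...) does not return crash_base itself, the requested memory is already taken.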
Signed-off-by: Yinghai Lu LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4c689f45b238..ad8c75b9e453 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1377,14 +1377,11 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; int ret, x2apic_enabled = 0; - int dmar_table_init_ret = 0; + int dmar_table_init_ret; -#ifdef CONFIG_INTR_REMAP dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret) - pr_debug("dmar_table_init() failed with %d:\n", - dmar_table_init_ret); -#endif + if (dmar_table_init_ret && !x2apic_supported()) + return; ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { -- cgit v1.2.2 From 37ef2a3029fde884808ff1b369677abc7dd9a79a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86: Re-get cfg_new in case reuse/move irq_desc When irq_desc is moved, we need to make sure to use the right cfg_new. Signed-off-by: Yinghai Lu LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ff237199fa23..085e60e303cf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3186,6 +3186,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) continue; desc_new = move_irq_desc(desc_new, node); + cfg_new = desc_new->chip_data; if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; -- cgit v1.2.2 From 163d3866cfa79aa5945f1ee5e43fb3ed1455f75c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86: apic: Print out SRAT table APIC id in hex Make it consistent with the APIC MADT printout; for big systems the APIC id in hex is more readable. Signed-off-by: Yinghai Lu LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index dbb5381f7b3b..9d7ce96e5a5c 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } @@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } -- cgit v1.2.2 From 021428ad1418cf3c386a1a0157140c3ea29b17ef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86, numa, bootmem: Only free bootmem on NUMA failure path In the NUMA bootmem setup failure path we freed nodedata_phys incorrectly. Signed-off-by: Yinghai Lu Cc: Thomas Gleixner Cc: H.
Peter Anvin Cc: Rusty Russell Cc: David Rientjes Cc: Andrew Morton LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 086f98a66d80..3acd870d316a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); if (bootmap == NULL) { - if (nodedata_phys < start || nodedata_phys >= end) - free_bootmem(nodedata_phys, pgdat_size); + if (nodedata_phys < start || nodedata_phys >= end) { + /* + * only need to free it if it is from other node + * bootmem + */ + if (nid != nodeid) + free_bootmem(nodedata_phys, pgdat_size); + } node_data[nodeid] = NULL; return; } -- cgit v1.2.2 From d9c2d5ac6af87b4491bff107113aaf16f6c2b2d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86, numa: Use near(er) online node instead of roundrobin for NUMA CPU to node mapping is set via the following sequence: 1. numa_init_array(): Set up roundrobin from cpu to online node 2. init_cpu_to_node(): Set the mapping according to apicid_to_node[] from the SRAT, but only for nodes that are online; cpus on nodes without ram (aka not online) keep the roundrobin mapping. 3. later, srat_detect_node() for Intel/AMD will use the first_online node or a nearby node. The problem is that setup_per_cpu_areas() is not called between 2 and 3, so the per_cpu area for a cpu still mapped by roundrobin sits on a different node and could end up on a node two hops away. So optimize this: add find_near_online_node() and use it in init_cpu_to_node(). Signed-off-by: Yinghai Lu Cc: Tejun Heo Cc: Linus Torvalds Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Rusty Russell Cc: David Rientjes Cc: Andrew Morton LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 6 +++++- arch/x86/mm/numa_64.c | 21 ++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 40e1835b35e8..c900b73f9224 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* Don't do the funky fallback heuristics the AMD version employs for now. */ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE || !node_online(node)) + if (node == NUMA_NO_NODE) node = first_node(node_online_map); + else if (!node_online(node)) { + /* reuse the value from init_cpu_to_node() */ + node = cpu_to_node(cpu); + } numa_set_node(cpu, node); printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3acd870d316a..83bbc70d11bb 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -764,6 +764,25 @@ static __init int numa_setup(char *opt) early_param("numa", numa_setup); #ifdef CONFIG_NUMA + +static __init int find_near_online_node(int node) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + for_each_online_node(n) { + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + return best_node; +} + /* * Setup early cpu_to_node.
* @@ -795,7 +814,7 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; if (!node_online(node)) - continue; + node = find_near_online_node(node); numa_set_node(cpu, node); } } -- cgit v1.2.2 From 6e3d8330ae2c4b2c11a9577a0130d2ecda1c610d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Nov 2009 10:19:20 +0100 Subject: perf events: Do not generate function trace entries in perf code Decreases perf overhead when function tracing is enabled by about 50%. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 68537e957a9b..1d2cb383410e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -5,6 +5,7 @@ # Don't trace early stages of a secondary CPU boot ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif # Make sure load_percpu_segment has no stackprotector -- cgit v1.2.2 From 0e7810be30f66e9f430c4ce2cd3b14634211690f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 20 Nov 2009 14:00:14 +0000 Subject: x86: Suppress stack overrun message for init_task init_task doesn't get its stack end location set to STACK_END_MAGIC, and hence the message is confusing rather than helpful in this case. Signed-off-by: Jan Beulich LKML-Reference: <4B06AEFE02000078000211F4@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f4cee9028cf0..071eee604147 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -658,7 +658,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, show_fault_oops(regs, error_code, address); stackend = end_of_stack(tsk); - if (*stackend != STACK_END_MAGIC) + if (tsk != &init_task && *stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; -- cgit v1.2.2 From 9f800de38b05d84809e89f16671d636a140eede7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 12:45:25 +0100 Subject: x86/amd-iommu: un__init iommu_setup_msi This function may be called on the resume path and cannot be dropped after booting. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0d4581e602a4..72bdbdac9b48 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -926,7 +926,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msi(struct amd_iommu *iommu) +static int iommu_setup_msi(struct amd_iommu *iommu) { int r; -- cgit v1.2.2 From b369e521237d6ef21c453f3ac4f4b8577ec14f87 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 23 Nov 2009 19:54:06 +0800 Subject: crypto: aesni-intel - Use gas macro for AES-NI instructions Old binutils do not support the AES-NI instructions, so to keep the kernel compilable with them, .byte code is used instead of AES-NI assembly instructions. But the readability and flexibility of raw .byte code is not good.
So corresponding assembly instruction like gas macro is used instead. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 517 +++++++++++++------------------------- 1 file changed, 173 insertions(+), 344 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index eb0566e83319..20bb0e1ac681 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -16,6 +16,7 @@ */ #include +#include .text @@ -122,103 +123,72 @@ ENTRY(aesni_set_key) movups 0x10(%rsi), %xmm2 # other user key movaps %xmm2, (%rcx) add $0x10, %rcx - # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_256a - # aeskeygenassist $0x1, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + AESKEYGENASSIST 0x1 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 call _key_expansion_256a - # aeskeygenassist $0x2, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + AESKEYGENASSIST 0x2 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 call _key_expansion_256a - # aeskeygenassist $0x4, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + AESKEYGENASSIST 0x4 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 call _key_expansion_256a - # aeskeygenassist $0x8, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + AESKEYGENASSIST 0x8 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 call _key_expansion_256a - # aeskeygenassist $0x10, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + AESKEYGENASSIST 0x10 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 call _key_expansion_256a - # aeskeygenassist $0x20, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + AESKEYGENASSIST 0x20 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 call _key_expansion_256a jmp .Ldec_key .Lenc_key192: movq 0x10(%rsi), %xmm2 # other user key - # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_192a - # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 call _key_expansion_192b - # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 call _key_expansion_192a - # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 call _key_expansion_192b - # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 call _key_expansion_192a - # aeskeygenassist $0x20, %xmm2, %xmm1 # 
round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 call _key_expansion_192b - # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 call _key_expansion_192a - # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80 + AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 call _key_expansion_192b jmp .Ldec_key .Lenc_key128: - # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 call _key_expansion_128 - # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 call _key_expansion_128 - # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 call _key_expansion_128 - # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 call _key_expansion_128 - # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 call _key_expansion_128 - # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 call _key_expansion_128 - # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40 + AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 call _key_expansion_128 - # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80 + AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 call _key_expansion_128 - # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b + AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 call _key_expansion_128 - # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36 + AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 call _key_expansion_128 .Ldec_key: sub $0x10, %rcx @@ -231,8 +201,7 @@ ENTRY(aesni_set_key) .align 4 .Ldec_key_loop: movaps (%rdi), %xmm0 - # aesimc %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8 + AESIMC %xmm0 %xmm1 movaps %xmm1, (%rsi) add $0x10, %rdi sub $0x10, %rsi @@ -274,51 +243,37 @@ _aesni_enc1: je .Lenc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x50(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE .align 4 .Lenc192: movaps -0x40(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x30(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE .align 4 .Lenc128: movaps -0x20(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x10(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps (TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x10(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x20(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x30(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x40(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC 
KEY STATE movaps 0x50(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x60(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x70(TKEYP), KEY - # aesenclast KEY, STATE # last round - .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 + AESENCLAST KEY STATE ret /* @@ -353,135 +308,79 @@ _aesni_enc4: je .L4enc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x50(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 #.align 4 .L4enc192: movaps -0x40(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x30(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 #.align 4 .L4enc128: movaps -0x20(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x10(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps (TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x10(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x20(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 
movaps 0x30(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x40(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x50(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x60(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x70(TKEYP), KEY - # aesenclast KEY, STATE1 # last round - .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 - # aesenclast KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 - # aesenclast KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xea - # aesenclast KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2 + AESENCLAST KEY STATE1 # last round + AESENCLAST KEY STATE2 + AESENCLAST KEY STATE3 + AESENCLAST KEY STATE4 ret /* @@ -518,51 +417,37 @@ _aesni_dec1: je .Ldec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x50(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE .align 4 .Ldec192: movaps -0x40(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x30(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE .align 4 .Ldec128: movaps -0x20(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x10(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps (TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x10(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x20(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x30(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x40(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x50(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x60(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x70(TKEYP), KEY - # aesdeclast KEY, STATE # last round - .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 + AESDECLAST KEY STATE ret /* @@ -597,135 +482,79 @@ _aesni_dec4: je .L4dec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 
0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x50(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 .align 4 .L4dec192: movaps -0x40(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x30(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 .align 4 .L4dec128: movaps -0x20(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x10(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps (TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x10(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x20(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x30(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x40(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # 
aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x50(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x60(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x70(TKEYP), KEY - # aesdeclast KEY, STATE1 # last round - .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 - # aesdeclast KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 - # aesdeclast KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xea - # aesdeclast KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2 + AESDECLAST KEY STATE1 # last round + AESDECLAST KEY STATE2 + AESDECLAST KEY STATE3 + AESDECLAST KEY STATE4 ret /* -- cgit v1.2.2 From be831297716036de5b24308447ecb69f1706a846 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 12:50:00 +0100 Subject: x86/amd-iommu: attach devices to pre-allocated domains early For some devices the ACPI table may define unity map requirements which must me met when the IOMMU is enabled. So we need to attach devices to their domains as early as possible so that these mappings are in place when needed. This patch assigns the domains right after they are allocated. Otherwise this can result in I/O page faults before a driver binds to a device and BIOS is still using it. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 093bd526c949..b74b21247584 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2047,10 +2047,10 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - u16 devid; + u16 devid, __devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); + __devid = devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; @@ -2065,6 +2065,10 @@ static void prealloc_protection_domains(void) init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; + attach_device(iommu, &dma_dom->domain, devid); + if (__devid != devid) + attach_device(iommu, &dma_dom->domain, __devid); + list_add_tail(&dma_dom->list, &iommu_pd_list); } } -- cgit v1.2.2 From 564ec0ec05ac6ee409bde81f7ef27a3dadbf3a6a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 23 Nov 2009 19:55:22 +0800 Subject: crypto: ghash-clmulni-intel - Use gas macro for PCLMULQDQ-NI and PSHUFB Old binutils do not support PCLMULQDQ-NI and PSHUFB, to make kernel can be compiled by them, .byte code is used instead of assembly instructions. But the readability and flexibility of raw .byte code is not good. So corresponding assembly instruction like gas macro is used instead. 
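Note that assembler support for these opcodes is independent of CPU support; at run time the drivers key off CPUID leaf 1, ECX bit 1 (PCLMULQDQ) and bit 25 (AES). A small user-space probe, assuming GCC's cpuid.h with its bit_PCLMUL/bit_AES constants:

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
			return 1;	/* CPUID leaf 1 not available */
		printf("PCLMULQDQ: %s\n", (ecx & bit_PCLMUL) ? "yes" : "no");
		printf("AES-NI:    %s\n", (ecx & bit_AES) ? "yes" : "no");
		return 0;
	}

The gas macros only solve the build-time half of the problem; the modules still perform a check along these lines before registering their algorithms.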
Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_asm.S | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 59584982fb75..1528dc4886cf 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -17,7 +17,7 @@ */ #include -#include +#include .align 16 .Lbswap_mask: @@ -56,12 +56,9 @@ __clmul_gf128mul_ble: pxor DATA, T2 pxor SHASH, T3 - # pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0 - .byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00 - # pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1 - .byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11 - # pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0) - .byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00 + PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0 + PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1 + PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0) pxor DATA, T2 pxor T1, T2 # T2 = a0 * b1 + a1 * b0 @@ -101,11 +98,9 @@ ENTRY(clmul_ghash_mul) movups (%rdi), DATA movups (%rsi), SHASH movaps .Lbswap_mask, BSWAP - # pshufb BSWAP, DATA - PSHUFB_XMM5_XMM0 + PSHUFB_XMM BSWAP DATA call __clmul_gf128mul_ble - # pshufb BSWAP, DATA - .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 + PSHUFB_XMM BSWAP DATA movups DATA, (%rdi) ret @@ -119,21 +114,18 @@ ENTRY(clmul_ghash_update) movaps .Lbswap_mask, BSWAP movups (%rdi), DATA movups (%rcx), SHASH - # pshufb BSWAP, DATA - PSHUFB_XMM5_XMM0 + PSHUFB_XMM BSWAP DATA .align 4 .Lupdate_loop: movups (%rsi), IN1 - # pshufb BSWAP, IN1 - PSHUFB_XMM5_XMM6 + PSHUFB_XMM BSWAP IN1 pxor IN1, DATA call __clmul_gf128mul_ble sub $16, %rdx add $16, %rsi cmp $16, %rdx jge .Lupdate_loop - # pshufb BSWAP, DATA - PSHUFB_XMM5_XMM0 + PSHUFB_XMM BSWAP DATA movups DATA, (%rdi) .Lupdate_just_ret: ret @@ -146,8 +138,7 @@ ENTRY(clmul_ghash_update) ENTRY(clmul_ghash_setkey) movaps .Lbswap_mask, BSWAP movups (%rsi), %xmm0 - # pshufb BSWAP, %xmm0 - PSHUFB_XMM5_XMM0 + PSHUFB_XMM BSWAP %xmm0 movaps %xmm0, %xmm1 psllq $1, %xmm0 psrlq $63, %xmm1 -- cgit v1.2.2 From 68ee87164e73f68cf09070043c97e7f61e6966d4 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 23 Nov 2009 20:19:47 +0800 Subject: crypto: ghash-clmulni-intel - Put proper .data section in place Lbswap_mask, Lpoly and Ltwo_one should clearly belong to .data section, not .text. Signed-off-by: Jiri Kosina Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_asm.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 1528dc4886cf..1eb7f90cb7b9 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -19,6 +19,8 @@ #include #include +.data + .align 16 .Lbswap_mask: .octa 0x000102030405060708090a0b0c0d0e0f -- cgit v1.2.2 From ba6909b719a5ccc0c8100d2895bb7ff557b2eeae Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 23 Nov 2009 21:17:13 +0530 Subject: hw-breakpoint: Attribute authorship of hw-breakpoint related files Attribute authorship to developers of hw-breakpoint related files. 
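On the .data placement fix a few hunks up: in C the compiler chooses sections automatically, and nm makes the placement easy to verify (a toy example, assuming a GNU toolchain; the single-letter codes are nm's section classes):

	/* build and inspect: gcc -c sections.c && nm sections.o */
	const unsigned char bswap_mask[16] = {		/* 'R': read-only data */
		15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
	};

	unsigned char scratch[16] = { 1 };		/* 'D': .data */

	int touch(void)					/* 'T': .text */
	{
		return bswap_mask[0] + scratch[0];
	}

Hand-written assembly gets no such help: constants land in whatever section the last .text/.data directive selected, which is why the explicit .data was needed.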
Signed-off-by: K.Prasad Cc: Alan Stern Cc: Frederic Weisbecker LKML-Reference: <20091123154713.GA5593@in.ibm.com> [ v2: moved it to latest -tip ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 752daebe91c6..4d267fb77828 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -16,6 +16,10 @@ * Copyright (C) 2007 Alan Stern * Copyright (C) 2009 IBM Corporation * Copyright (C) 2009 Frederic Weisbecker + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker */ /* -- cgit v1.2.2 From e38e2af1c57c3eb5211331a5b4fcaae0c4a2a918 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 19 Nov 2009 17:12:43 -0600 Subject: x86: SGI UV: Fix BAU initialization A memory-mapped register that affects the SGI UV Broadcast Assist Unit's interrupt handling may sometimes be uninitialized. Remove the condition on its initialization, as that condition can be randomly satisfied by a hardware reset. Signed-off-by: Cliff Wickman Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 503c1f2e8835..af21e5556900 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -819,10 +819,8 @@ static int __init uv_init_blade(int blade) */ apicid = blade_to_first_apicid(blade); pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); - if ((pa & 0xff) != UV_BAU_MESSAGE) { - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, ((apicid << 32) | UV_BAU_MESSAGE)); - } return 0; } -- cgit v1.2.2 From 0444c9bd0cf4e0eb946a7fcaf34765accfa9404a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 20 Nov 2009 14:03:05 +0000 Subject: x86: Tighten conditionals on MCE related statistics irq_thermal_count is only maintained when X86_THERMAL_VECTOR is set, and neither X86_THERMAL_VECTOR nor X86_MCE_THRESHOLD needs extra wrapping in X86_MCE conditionals.
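Stripped to a schematic, the hardirq.h change moves from nested to independent guards (paraphrasing the hunk below):

	/* before: the thermal count was wrongly tied to X86_MCE as well */
	#ifdef CONFIG_X86_MCE
	unsigned int irq_thermal_count;
	# ifdef CONFIG_X86_MCE_THRESHOLD
	unsigned int irq_threshold_count;
	# endif
	#endif

	/* after: each counter is gated only by the option that maintains it */
	#ifdef CONFIG_X86_THERMAL_VECTOR
	unsigned int irq_thermal_count;
	#endif
	#ifdef CONFIG_X86_MCE_THRESHOLD
	unsigned int irq_threshold_count;
	#endif
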
Signed-off-by: Jan Beulich Cc: Hidetoshi Seto Cc: Yong Wang Cc: Suresh Siddha Cc: Andi Kleen Cc: Borislav Petkov Cc: Arjan van de Ven LKML-Reference: <4B06AFA902000078000211F8@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hardirq.h | 6 +++--- arch/x86/kernel/irq.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 82e3e8f01043..108eb6fd1ae7 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -20,11 +20,11 @@ typedef struct { unsigned int irq_call_count; unsigned int irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; -# endif #endif } ____cacheline_aligned irq_cpustat_t; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 04bbd5278568..19212cb01558 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); -# endif #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); @@ -194,11 +194,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_call_count; sum += irq_stats(cpu)->irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; -# endif #endif #ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); -- cgit v1.2.2 From 581f202bcd60acbc3af1f5faa429e570c512f8a3 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 20 Nov 2009 15:48:26 -0600 Subject: x86: UV RTC: Always enable RTC clocksource Always enable the RTC clocksource on UV systems. 
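Clocksource selection is rating-driven: the timekeeping core prefers the highest-rated registered source, so dropping the UV RTC to 250 on a single-blade system lets the TSC (rated 300 in kernels of this vintage) win. A toy model of just the rating comparison (the real code also tracks availability and user overrides):

	#include <stdio.h>

	struct clocksource {
		const char *name;
		int rating;		/* higher is preferred */
	};

	static const struct clocksource *select_best(const struct clocksource *cs,
						     int n)
	{
		const struct clocksource *best = &cs[0];
		int i;

		for (i = 1; i < n; i++)
			if (cs[i].rating > best->rating)
				best = &cs[i];
		return best;
	}

	int main(void)
	{
		const struct clocksource cs[] = {
			{ "uv_rtc", 250 },	/* demoted on single-blade UV */
			{ "tsc",    300 },
		};

		printf("selected: %s\n", select_best(cs, 2)->name);
		return 0;
	}
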
Signed-off-by: Dimitri Sivanich LKML-Reference: <20091120214826.GA20016@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 3da7b1d8bfd3..3c84aa001c11 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -74,7 +74,6 @@ struct uv_rtc_timer_head { */ static struct uv_rtc_timer_head **blade_info __read_mostly; -static int uv_rtc_enable; static int uv_rtc_evt_enable; /* @@ -335,14 +334,6 @@ static void uv_rtc_interrupt(void) ced->event_handler(ced); } -static int __init uv_enable_rtc(char *str) -{ - uv_rtc_enable = 1; - - return 1; -} -__setup("uvrtc", uv_enable_rtc); - static int __init uv_enable_evt_rtc(char *str) { uv_rtc_evt_enable = 1; @@ -364,12 +355,16 @@ static __init int uv_rtc_setup_clock(void) { int rc; - if (!uv_rtc_enable || !is_uv_system() || x86_platform_ipi_callback) + if (!is_uv_system()) return -ENODEV; clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, clocksource_uv.shift); + /* If single blade, prefer tsc */ + if (uv_num_possible_blades() == 1) + clocksource_uv.rating = 250; + rc = clocksource_register(&clocksource_uv); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); @@ -377,7 +372,7 @@ static __init int uv_rtc_setup_clock(void) printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", sn_rtc_cycles_per_second/(unsigned long)1E6); - if (rc || !uv_rtc_evt_enable) + if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) return rc; /* Setup and register clockevents */ -- cgit v1.2.2 From fd12a0d69aee6d90fa9b9890db24368a897f8423 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 19 Nov 2009 14:23:41 -0600 Subject: x86: UV SGI: Don't track GRU space in PAT GRU space is always mapped as WB in the page table. There is no need to track the mappings in the PAT. This also eliminates the "freeing invalid memtype" messages when the GRU space is unmapped. Signed-off-by: Jack Steiner LKML-Reference: <20091119202341.GA4420@sgi.com> [ v2: fix build failure ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pat.h | 2 ++ arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/x2apic_uv_x.c | 19 ++++++++++++++++++- arch/x86/kernel/x86_init.c | 2 ++ arch/x86/mm/pat.c | 12 +++++++++--- 5 files changed, 33 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index e2c1668dde7a..4c35dd016b54 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -24,4 +24,6 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end, void io_free_memtype(resource_size_t start, resource_size_t end); +int default_is_untracked_pat_range(u64 start, u64 end); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2c756fd4ab0e..8112ed786287 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -113,11 +113,13 @@ struct x86_cpuinit_ops { /** * struct x86_platform_ops - platform specific runtime functions + * @is_untracked_pat_range exclude from PAT logic * @calibrate_tsc: calibrate TSC * @get_wallclock: get time from HW clock like RTC etc. 
* @set_wallclock: set time back to HW clock */ struct x86_platform_ops { + int (*is_untracked_pat_range)(u64 start, u64 end); unsigned long (*calibrate_tsc)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5f5886a6b53..2477c9f88093 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -30,10 +30,22 @@ #include #include #include +#include DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; +static u64 gru_start_paddr, gru_end_paddr; + +static int is_GRU_range(u64 start, u64 end) +{ + return start >= gru_start_paddr && end < gru_end_paddr; +} + +static int uv_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end) || is_GRU_range(start, end); +} static int early_get_nodeid(void) { @@ -49,6 +61,7 @@ static int early_get_nodeid(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) @@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode) int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); - if (gru.s.enable) + if (gru.s.enable) { map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)gru.s.base << shift); + gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); + + } } static __init void map_mmr_high(int max_pnode) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4449a4a2c2ed..bcc749ef62dc 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -13,6 +13,7 @@ #include #include #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -69,6 +70,7 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { }; struct x86_platform_ops x86_platform = { + .is_untracked_pat_range = default_is_untracked_pat_range, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e78cd0ec2bcf..38a66ef9426d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -348,6 +349,11 @@ static int free_ram_pages_type(u64 start, u64 end) return 0; } +int default_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end); +} + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -388,7 +394,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (is_ISA_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end - 1)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; @@ -499,7 +505,7 @@ int free_memtype(u64 start, u64 end) return 0; /* Low ISA region is always mapped WB. 
No need to track */ - if (is_ISA_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end - 1)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); @@ -582,7 +588,7 @@ static unsigned long lookup_memtype(u64 paddr) int rettype = _PAGE_CACHE_WB; struct memtype *entry; - if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE - 1)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { -- cgit v1.2.2 From 27c13ecec4d8856687b50b959e1146845b478f95 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 21 Nov 2009 14:01:45 +0100 Subject: x86, cpu: mv display_cacheinfo -> cpu_detect_cache_sizes display_cacheinfo() doesn't display anything anymore and it is used to detect CPU cache sizes. Rename it accordingly. Signed-off-by: Borislav Petkov LKML-Reference: <20091121130145.GA31357@liondog.tnic> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/centaur.c | 2 +- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/cpu/cpu.h | 2 +- arch/x86/kernel/cpu/cyrix.c | 2 +- arch/x86/kernel/cpu/transmeta.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c910a716a71c..7128b3799cec 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -535,7 +535,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - display_cacheinfo(c); + cpu_detect_cache_sizes(c); /* Multi core CPU? */ if (c->extended_cpuid_level >= 0x80000008) { diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index c95e831bb095..e58d978e0758 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); } - display_cacheinfo(c); + cpu_detect_cache_sizes(c); } enum { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 61242a56c2d6..9bf845dc8055 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void) static void __cpuinit default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 - display_cacheinfo(c); + cpu_detect_cache_sizes(c); #else /* Not much we can do here... 
*/ /* Check if at least it has cpuid */ @@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) } } -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 6de9a908e400..3624e8a0f71b 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -32,6 +32,6 @@ struct cpu_dev { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; -extern void display_cacheinfo(struct cpuinfo_x86 *c); +extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 19807b89f058..4fbd384fb645 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c) /* Handle the GX (Formally known as the GX2) */ if (c->x86 == 5 && c->x86_model == 5) - display_cacheinfo(c); + cpu_detect_cache_sizes(c); else init_cyrix(c); } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index bb62b3e5caad..28000743bbb0 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) early_init_transmeta(c); - display_cacheinfo(c); + cpu_detect_cache_sizes(c); /* Print CMS and CPU revision */ max = cpuid_eax(0x80860000); -- cgit v1.2.2 From 55a6ca25472ee01574bfc24d23b7f5fa09cc38cf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Nov 2009 15:12:07 -0800 Subject: x86, mm: Call is_untracked_pat_range() rather than is_ISA_range() Checkin fd12a0d69aee6d90fa9b9890db24368a897f8423 made the PAT untracked range a platform configurable, but missed one occurrence of is_ISA_range() which still refers to PAT-untracked memory, and therefore should be using the configurable. Signed-off-by: H. Peter Anvin Cc: Jack Steiner Cc: Suresh Siddha LKML-Reference: <20091119202341.GA4420@sgi.com> --- arch/x86/include/asm/pgtable.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index af6fd360ab35..1de2094d2e57 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -16,6 +16,8 @@ #ifndef __ASSEMBLY__ +#include + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, unsigned long new_flags) { /* - * PAT type is always WB for ISA. So no need to check. + * PAT type is always WB for untracked ranges, so no need to check. */ - if (is_ISA_range(paddr, paddr + size - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + size - 1)) return 1; /* -- cgit v1.2.2 From 8a27138924f64d2f30c1022f909f74480046bc3f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Nov 2009 14:49:20 -0800 Subject: x86, mm: is_untracked_pat_range() takes a normal semiclosed range is_untracked_pat_range() -- like its components, is_ISA_range() and is_GRU_range() -- takes a normal semiclosed interval (>=, <) whereas the PAT code called it as if it took a closed range (>=, <=). Fix. Although this is a bug, I believe it is non-manifest, simply because none of the callers will call this with non-page-aligned addresses. Signed-off-by: H.
Peter Anvin Acked-by: Thomas Gleixner Acked-by: Suresh Siddha LKML-Reference: <20091119202341.GA4420@sgi.com> --- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/mm/pat.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1de2094d2e57..a34c785c5a63 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -274,7 +274,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, /* * PAT type is always WB for untracked ranges, so no need to check. */ - if (x86_platform.is_untracked_pat_range(paddr, paddr + size - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) return 1; /* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 38a66ef9426d..b5bc08cfcea6 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -394,7 +394,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (x86_platform.is_untracked_pat_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; @@ -505,7 +505,7 @@ int free_memtype(u64 start, u64 end) return 0; /* Low ISA region is always mapped WB. No need to track */ - if (x86_platform.is_untracked_pat_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); @@ -588,7 +588,7 @@ static unsigned long lookup_memtype(u64 paddr) int rettype = _PAGE_CACHE_WB; struct memtype *entry; - if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { -- cgit v1.2.2 From 65f116f5f16dc3371fce24fb24bc4843b5380ba5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Nov 2009 14:44:39 -0800 Subject: x86: Change is_ISA_range() into an inline function Change is_ISA_range() from a macro to an inline function. This makes it type safe, and also allows it to be assigned to a function pointer if necessary. Signed-off-by: H. 
Peter Anvin Acked-by: Thomas Gleixner LKML-Reference: <20091119202341.GA4420@sgi.com> --- arch/x86/include/asm/e820.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 40b4e614fe71..68b4e0ec1950 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -61,6 +61,12 @@ struct e820map { struct e820entry map[E820_X_MAX]; }; +#define ISA_START_ADDRESS 0xa0000 +#define ISA_END_ADDRESS 0x100000 + +#define BIOS_BEGIN 0x000a0000 +#define BIOS_END 0x00100000 + #ifdef __KERNEL__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map e820; @@ -126,15 +132,14 @@ extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); -#endif /* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 -#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) +static inline bool is_ISA_range(u64 s, u64 e) +{ + return s >= ISA_START_ADDRESS && e < ISA_END_ADDRESS; +} -#define BIOS_BEGIN 0x000a0000 -#define BIOS_END 0x00100000 +#endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ #ifdef __KERNEL__ #include -- cgit v1.2.2 From eb41c8be89dbe079f49202774e04a79ccac48a09 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Nov 2009 14:46:07 -0800 Subject: x86, platform: Change is_untracked_pat_range() to bool; cleanup init - Change is_untracked_pat_range() to return bool. - Clean up the initialization of is_untracked_pat_range() -- by default, we simply point it at is_ISA_range() directly. - Move is_untracked_pat_range to the end of struct x86_platform, since it is the newest field. Signed-off-by: H. Peter Anvin Acked-by: Thomas Gleixner Cc: Jack Steiner LKML-Reference: <20091119202341.GA4420@sgi.com> --- arch/x86/include/asm/pat.h | 2 -- arch/x86/include/asm/x86_init.h | 4 ++-- arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++-- arch/x86/kernel/x86_init.c | 2 +- arch/x86/mm/pat.c | 5 ----- 5 files changed, 5 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 4c35dd016b54..e2c1668dde7a 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -24,6 +24,4 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end, void io_free_memtype(resource_size_t start, resource_size_t end); -int default_is_untracked_pat_range(u64 start, u64 end); - #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8112ed786287..024cf3c1fd82 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -113,16 +113,16 @@ struct x86_cpuinit_ops { /** * struct x86_platform_ops - platform specific runtime functions - * @is_untracked_pat_range exclude from PAT logic * @calibrate_tsc: calibrate TSC * @get_wallclock: get time from HW clock like RTC etc. 
* @set_wallclock: set time back to HW clock + * @is_untracked_pat_range exclude from PAT logic */ struct x86_platform_ops { - int (*is_untracked_pat_range)(u64 start, u64 end); unsigned long (*calibrate_tsc)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); + bool (*is_untracked_pat_range)(u64 start, u64 end); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2477c9f88093..597a47b1cec6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -37,12 +37,12 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; -static int is_GRU_range(u64 start, u64 end) +static inline bool is_GRU_range(u64 start, u64 end) { return start >= gru_start_paddr && end < gru_end_paddr; } -static int uv_is_untracked_pat_range(u64 start, u64 end) +static bool uv_is_untracked_pat_range(u64 start, u64 end) { return is_ISA_range(start, end) || is_GRU_range(start, end); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index bcc749ef62dc..861b8b54e172 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -70,8 +70,8 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { }; struct x86_platform_ops x86_platform = { - .is_untracked_pat_range = default_is_untracked_pat_range, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, + .is_untracked_pat_range = is_ISA_range, }; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index b5bc08cfcea6..ef712518b5b4 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -349,11 +349,6 @@ static int free_ram_pages_type(u64 start, u64 end) return 0; } -int default_is_untracked_pat_range(u64 start, u64 end) -{ - return is_ISA_range(start, end); -} - /* * req_type typically has one of the: * - _PAGE_CACHE_WB -- cgit v1.2.2 From b24c2a925a9837cccf54d50aeac22ba0cbc15455 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Nov 2009 02:48:18 -0800 Subject: x86: Move find_smp_config() earlier and avoid bootmem usage Move the find_smp_config() call to before bootmem is initialized. Use reserve_early() instead of reserve_bootmem() in it. This simplifies the code, we only need to call find_smp_config() once and can remove the now unneeded reserve parameter from x86_init_mpparse::find_smp_config. We thus also reduce x86's dependency on bootmem allocations. 
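[ Illustration, not part of the patch above: reserve_early() operates on the e820-based early reservation table, which is why a firmware table can now be reserved before bootmem is initialized. A minimal sketch of the calling pattern, with hypothetical names; the string is only a debug label for the reservation:

	static void __init example_reserve_table(u64 phys, unsigned long size)
	{
		/* reserve the semiclosed physical range [phys, phys + size) */
		reserve_early(phys, phys + size, "example-table");
	}

]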
Signed-off-by: Yinghai Lu LKML-Reference: <4B0BB9F2.70907@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpspec.h | 11 +++-------- arch/x86/include/asm/x86_init.h | 2 +- arch/x86/kernel/apic/numaq_32.c | 5 ----- arch/x86/kernel/mpparse.c | 44 +++++++++++------------------------------ arch/x86/kernel/setup.c | 10 +++++----- arch/x86/kernel/visws_quirks.c | 2 +- arch/x86/mm/k8topology_64.c | 12 ----------- 7 files changed, 22 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 79c94500c0bb..644cf1a50bfd 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -71,12 +71,7 @@ static inline void early_get_smp_config(void) static inline void find_smp_config(void) { - x86_init.mpparse.find_smp_config(1); -} - -static inline void early_find_smp_config(void) -{ - x86_init.mpparse.find_smp_config(0); + x86_init.mpparse.find_smp_config(); } #ifdef CONFIG_X86_MPPARSE @@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); # else # define default_mpc_oem_bus_info NULL # endif -extern void default_find_smp_config(unsigned int reserve); +extern void default_find_smp_config(void); extern void default_get_smp_config(unsigned int early); #else static inline void early_reserve_e820_mpc_new(void) { } @@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { } #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL #define default_mpc_oem_bus_info NULL -#define default_find_smp_config x86_init_uint_noop +#define default_find_smp_config x86_init_noop #define default_get_smp_config x86_init_uint_noop #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 024cf3c1fd82..97e5fb4f3bd3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -26,7 +26,7 @@ struct x86_init_mpparse { void (*smp_read_mpc_oem)(struct mpc_table *mpc); void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); - void (*find_smp_config)(unsigned int reserve); + void (*find_smp_config)(void); void (*get_smp_config)(unsigned int early); }; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index efa00e2b8505..9c0629ceb528 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -263,11 +263,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static __init void early_check_numaq(void) { - /* - * Find possible boot-time SMP configuration: - */ - early_find_smp_config(); - /* * get boot-time SMP configuration: */ diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5be95ef4ffec..35a57c963df9 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early) */ } -static void __init smp_reserve_bootmem(struct mpf_intel *mpf) +static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); -#ifdef CONFIG_X86_32 - /* - * We cannot access to MPC table to compute table size yet, - * as only few megabytes from the bottom is mapped now. - * PC-9800's MPC table places on the very last of physical - * memory; so that simply reserving PAGE_SIZE from mpf->physptr - * yields BUG() in reserve_bootmem. 
- * also need to make sure physptr is below than max_low_pfn - * we don't need reserve the area above max_low_pfn - */ - unsigned long end = max_low_pfn * PAGE_SIZE; - if (mpf->physptr < end) { - if (mpf->physptr + size > end) - size = end - mpf->physptr; - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); - } -#else - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); -#endif + reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); } -static int __init smp_scan_config(unsigned long base, unsigned long length, - unsigned reserve) +static int __init smp_scan_config(unsigned long base, unsigned long length) { unsigned int *bp = phys_to_virt(base); struct mpf_intel *mpf; + unsigned long mem; apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", bp, length); @@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", mpf, (u64)virt_to_phys(mpf)); - if (!reserve) - return 1; - reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), - BOOTMEM_DEFAULT); + mem = virt_to_phys(mpf); + reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); if (mpf->physptr) - smp_reserve_bootmem(mpf); + smp_reserve_memory(mpf); return 1; } @@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -void __init default_find_smp_config(unsigned int reserve) +void __init default_find_smp_config(void) { unsigned int address; @@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve) * 2) Scan the top 1K of base RAM * 3) Scan the 64K of bios */ - if (smp_scan_config(0x0, 0x400, reserve) || - smp_scan_config(639 * 0x400, 0x400, reserve) || - smp_scan_config(0xF0000, 0x10000, reserve)) + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) return; /* * If it is an SMP machine we should know now, unless the @@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve) address = get_bios_ebda(); if (address) - smp_scan_config(address, 0x400, reserve); + smp_scan_config(address, 0x400); } #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e3eae5965e4a..cdb6a8a506dd 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -913,6 +913,11 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. 
@@ -927,11 +932,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn, acpi, k8); - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); - #ifdef CONFIG_X86_64 /* * dma32_reserve_bootmem() allocates bootmem which may conflict diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index f068553a1b17..1498efa964b6 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static void __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(void) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index b9e2dbfe55c3..970ed579d4e4 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -57,18 +57,6 @@ static __init void early_get_boot_cpu_id(void) * need to get boot_cpu_id so can use that to create apicid_to_node * in k8_scan_nodes() */ - /* - * Find possible boot-time SMP configuration: - */ -#ifdef CONFIG_X86_MPPARSE - early_find_smp_config(); -#endif -#ifdef CONFIG_ACPI - /* - * Read APIC information from ACPI tables. - */ - early_acpi_boot_init(); -#endif #ifdef CONFIG_X86_MPPARSE /* * get boot-time SMP configuration: -- cgit v1.2.2 From 1261a02a0c0ab8e643125705f0d1d83e5090e4d1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 24 Nov 2009 05:27:18 -0800 Subject: perf_events, x86: Fix validate_event bug The validate_event() was failing on valid event combinations. The function was assuming that if x86_schedule_event() returned 0, it meant error. But x86_schedule_event() returns the counter index and 0 is a perfectly valid value. An error is returned if the function returns a negative value. Furthermore, validate_event() was also failing for event groups because the event->pmu was not set until after hw_perf_event_init(). Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: paulus@samba.org Cc: perfmon2-devel@lists.sourceforge.net Cc: eranian@gmail.com LKML-Reference: <4b0bdf36.1818d00a.07cc.25ae@mx.google.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bd8743024204..c1bbed1021d9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2229,10 +2229,10 @@ validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) { struct hw_perf_event fake_event = event->hw; - if (event->pmu != &pmu) + if (event->pmu && event->pmu != &pmu) return 0; - return x86_schedule_event(cpuc, &fake_event); + return x86_schedule_event(cpuc, &fake_event) >= 0; } static int validate_group(struct perf_event *event) -- cgit v1.2.2 From b8cbe7e82ec8b55d7bbdde66fc69e788fde00dc6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:57:56 +1030 Subject: [CPUFREQ] cpumask: don't put a cpumask on the stack in x86...cpufreq/powernow-k8.c It's still mugging the current process's cpumask, but as comment in 1ff6e97f1d says, it's not a trivial fix. 
So, at least we can use a cpumask_var_t to do the Wrong Thing the Right Way :) Signed-off-by: Rusty Russell To: cpufreq@vger.kernel.org Cc: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3f12dabeab52..f30d25383940 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { - cpumask_t oldmask; + cpumask_var_t oldmask; struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; @@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol, checkfid = data->currfid; checkvid = data->currvid; - /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); + /* only run on specific CPU from here on. */ + /* This is poor form: use a workqueue or smp_call_function_single */ + if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(oldmask, tsk_cpumask(current)); + set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol, ret = 0; err_out: - set_cpus_allowed_ptr(current, &oldmask); + set_cpus_allowed_ptr(current, oldmask); + free_cpumask_var(oldmask); return ret; } -- cgit v1.2.2 From db2820dd5445a44b4726f15a2bc89b9ded2503eb Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Sun, 25 Oct 2009 19:45:57 +0100 Subject: [CPUFREQ] powernow-k6: set transition latency value so ondemand governor can be used Set the transition latency to a value smaller than CPUFREQ_ETERNAL so governors other than "performance" work (like the "ondemand" one). The value is found in "AMD PowerNow! Technology Platform Design Guide for Embedded Processors" dated December 2000 (AMD doc #24267A). The answer to one of the FAQs on page 40 states that the suggested complete transition period is 200 us. Tested on a K6-2+ CPU with a K6-3 core (model 13, stepping 4). Signed-off-by: Krzysztof Helt Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index f10dea409f40..cb01dac267d3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) } /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cpuinfo.transition_latency = 200000; policy->cur = busfreq * max_multiplier; result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); -- cgit v1.2.2 From 1cce76c2ac60df40b02bf747982fb3f00e68f50a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 17 Nov 2009 14:39:53 -0800 Subject: [CPUFREQ] use an enum for speedstep processor identification The "unsigned int processor" everywhere confused Rusty, leading to breakage when he passed in smp_processor_id().
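[ Illustration only, with made-up names: the gain from the enum is self-documenting call sites -- the parameter is a processor *type*, not a CPU number. C still allows the implicit int-to-enum conversion, so this documents intent rather than enforcing it, but a call passing smp_processor_id() now looks obviously wrong in review:

	enum example_cpu_kind {
		EXAMPLE_CPU_PIII = 1,
		EXAMPLE_CPU_P4M  = 2,
	};

	static unsigned int example_get_khz(enum example_cpu_kind kind)
	{
		/* the CPU *kind* selects the lookup, not a CPU number */
		return (kind == EXAMPLE_CPU_P4M) ? 2400000 : 1000000;
	}

]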
Signed-off-by: Rusty Russell Acked-by: Dominik Brodowski Signed-off-by: Andrew Morton Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 2 +- arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 24 ++++++++++++------------ arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 3ae5a7a3a500..2ce8e0b5cc54 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev; /* speedstep_processor */ -static unsigned int speedstep_processor; +static enum speedstep_processor speedstep_processor; static u32 pmbase; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index f4c290b8482f..ad0083abfa23 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -34,7 +34,7 @@ static int relaxed_check; * GET PROCESSOR CORE SPEED IN KHZ * *********************************************************************/ -static unsigned int pentium3_get_frequency(unsigned int processor) +static unsigned int pentium3_get_frequency(enum speedstep_processor processor) { /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ struct { @@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void) /* Warning: may get called from smp_call_function_single. */ -unsigned int speedstep_get_frequency(unsigned int processor) +unsigned int speedstep_get_frequency(enum speedstep_processor processor) { switch (processor) { case SPEEDSTEP_CPU_PCORE: @@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor); * DETECT SPEEDSTEP SPEEDS * *********************************************************************/ -unsigned int speedstep_get_freqs(unsigned int processor, +unsigned int speedstep_get_freqs(enum speedstep_processor processor, unsigned int *low_speed, unsigned int *high_speed, unsigned int *transition_latency, diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h index 2b6c04e5a304..70d9cea1219d 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h @@ -11,18 +11,18 @@ /* processors */ - -#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ -#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ -#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ -#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ - +enum speedstep_processor { + SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ + SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ /* the following processors are not speedstep-capable and are not auto-detected * in speedstep_detect_processor(). However, their speed can be detected using * the speedstep_get_frequency() call. 
*/ -#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ -#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ -#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ + SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ + SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ + SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ +}; /* speedstep states -- only two of them */ @@ -31,10 +31,10 @@ /* detect a speedstep-capable processor */ -extern unsigned int speedstep_detect_processor (void); +extern enum speedstep_processor speedstep_detect_processor(void); /* detect the current speed (in khz) of the processor */ -extern unsigned int speedstep_get_frequency(unsigned int processor); +extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); /* detect the low and high speeds of the processor. The callback @@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor); * SPEEDSTEP_LOW; the second argument is zero so that no * cpufreq_notify_transition calls are initiated. */ -extern unsigned int speedstep_get_freqs(unsigned int processor, +extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, unsigned int *low_speed, unsigned int *high_speed, unsigned int *transition_latency, diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index befea088e4f5..04d73c114e49 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -35,7 +35,7 @@ static int smi_cmd; static unsigned int smi_sig; /* info about the processor */ -static unsigned int speedstep_processor; +static enum speedstep_processor speedstep_processor; /* * There are only two frequency states for each processor. Values -- cgit v1.2.2 From e2f74f355e9e2914483db10c05d70e69e0b7ae04 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Thu, 19 Nov 2009 12:31:01 +0100 Subject: [ACPI/CPUFREQ] Introduce bios_limit per cpu cpufreq sysfs interface This interface is mainly intended (and implemented) for ACPI _PPC BIOS frequency limitations, but other cpufreq drivers can also use it for similar use-cases. Why is this needed: Currently it's not obvious why cpufreq got limited. People see cpufreq/scaling_max_freq reduced, but this could have happened by: - any userspace prog writing to scaling_max_freq - thermal limitations - hardware (_PPC in ACPI case) limitations Therefore export bios_limit (in kHz) to: - Point out to the user that it's the BIOS (broken or intended) which limits frequency - Export it as a sysfs interface for userspace progs. While this was a rarely used feature on laptops, more and more server implementations will appear providing "Green IT" features like allowing the service processor to limit the frequency. People want to know about HW/BIOS frequency limitations.
All ACPI P-state driven cpufreq drivers are covered with this patch: - powernow-k8 - powernow-k7 - acpi-cpufreq Tested with a patched DSDT which limits the first two cores (_PPC returns 1) via _PPC, exposed by bios_limit: # echo 2200000 >cpu2/cpufreq/scaling_max_freq # cat cpu*/cpufreq/scaling_max_freq 2600000 2600000 2200000 2200000 # #scaling_max_freq shows general user/thermal/BIOS limitations # cat cpu*/cpufreq/bios_limit 2600000 2600000 2800000 2800000 # #bios_limit only shows the HW/BIOS limitation CC: Pallipadi Venkatesh CC: Len Brown CC: davej@codemonkey.org.uk CC: linux@dominikbrodowski.net Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 17 +++++++++-------- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 19 +++++++++++-------- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 17 +++++++++-------- 3 files changed, 29 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8b581d3905cb..d2e7c77c1ea4 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -764,14 +764,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = { }; static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, + .verify = acpi_cpufreq_verify, + .target = acpi_cpufreq_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = acpi_cpufreq_cpu_init, + .exit = acpi_cpufreq_cpu_exit, + .resume = acpi_cpufreq_resume, + .name = "acpi-cpufreq", + .owner = THIS_MODULE, + .attr = acpi_cpufreq_attr, }; static int __init acpi_cpufreq_init(void) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index d47c775eb0ab..9a97116f89e5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = { }; static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, + .verify = powernow_verify, + .target = powernow_target, + .get = powernow_get, +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + .bios_limit = acpi_processor_get_bios_limit, +#endif + .init = powernow_cpu_init, + .exit = powernow_cpu_exit, + .name = "powernow-k7", + .owner = THIS_MODULE, + .attr = powernow_table_attr, }; static int __init powernow_init(void) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f30d25383940..a9df9441a9a2 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1398,14 +1398,15 @@ static struct freq_attr *powernow_k8_attr[] = { }; static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, + .verify = powernowk8_verify, + .target = powernowk8_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = 
powernowk8_cpu_init, + .exit = __devexit_p(powernowk8_cpu_exit), + .get = powernowk8_get, + .name = "powernow-k8", + .owner = THIS_MODULE, + .attr = powernow_k8_attr, }; /* driver entry point for init */ -- cgit v1.2.2 From 5bf65b9ba67226eae9ffc398a0369fc4da35c259 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Nov 2009 02:46:59 -0800 Subject: x86, mtrr: Fix sorting of mtrr after subtracting In some cases we can coalesce MTRR entries after cleanup; this may allow us to have more entries. As such, introduce clean_sort_range() to sort and coalesce the MTRR entries. Signed-off-by: Yinghai Lu LKML-Reference: <4B0BB9A3.5020908@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 49 +++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 6e49f6f91f31..6987af786c02 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } +static int __init clean_sort_range(struct res_range *range, int az) +{ + int i, j, k = az - 1, nr_range = 0; + + for (i = 0; i < k; i++) { + if (range[i].end) + continue; + for (j = k; j > i; j--) { + if (range[j].end) { + k = j; + break; + } + } + if (j == i) + break; + range[i].start = range[k].start; + range[i].end = range[k].end; + range[k].start = 0; + range[k].end = 0; + k--; + } + /* count it */ + for (i = 0; i < az; i++) { + if (!range[i].end) { + nr_range = i; + break; + } + } + + /* sort them */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + + return nr_range; +} + #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" @@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, subtract_range(range, extra_remove_base, extra_remove_base + extra_remove_size - 1); - /* get new range num */ - nr_range = 0; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - nr_range++; - } if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); - for (i = 0; i < nr_range; i++) + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + } } /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + nr_range = clean_sort_range(range, RANGE_NUM); if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) -- cgit v1.2.2 From 2ed7a806d864bde5903b73da1c65b0316b21efd3 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Mon, 16 Nov 2009 14:21:13 -0700 Subject: x86/PCI: remove early PCI pr_debug statements commit db635adc turned -DDEBUG for x86/pci on when CONFIG_PCI_DEBUG is set. In general, I agree with that change. However, it exposes a bunch of very low level PCI debugging in the early x86 path, such as: 0 reading 2 from a: ffff 1 reading 2 from a: ffff 2 reading 2 from a: ffff 3 reading 2 from a: 300 3 reading 2 from 0: 1002 3 reading 2 from 2: 515e These statements add a lot of noise to the boot and aren't likely to be necessary even when handling random upstream bug reports.
[In contrast, statements such as these: pci 0000:02:04.0: found [14e4:164a] class 000200 header type 00 pci 0000:02:04.0: reg 10: [mem 0xf8000000-0xf9ffffff 64bit] pci 0000:02:04.0: reg 30: [mem 0x00000000-0x0001ffff pref] are indeed useful when remote debugging users' machines] Remove the noisy printks and save electrons everywhere. Cc: Bjorn Helgaas Cc: Yinghai Lu Cc: Andi Kleen Cc: Ingo Molnar Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- arch/x86/pci/early.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index aaf26ae58cd5..d1067d539bee 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -12,8 +12,6 @@ u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) u32 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inl(0xcfc); - if (v != 0xffffffff) - pr_debug("%x reading 4 from %x: %x\n", slot, offset, v); return v; } @@ -22,7 +20,6 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) u8 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inb(0xcfc + (offset&3)); - pr_debug("%x reading 1 from %x: %x\n", slot, offset, v); return v; } @@ -31,28 +28,24 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) u16 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inw(0xcfc + (offset&2)); - pr_debug("%x reading 2 from %x: %x\n", slot, offset, v); return v; } void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outl(val, 0xcfc); } void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outb(val, 0xcfc + (offset&3)); } void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outw(val, 0xcfc + (offset&2)); } -- cgit v1.2.2 From 7b7a78594292d540720485544ad1043b71de14e0 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 17 Nov 2009 23:19:53 +0100 Subject: PCI: fix comment typo in bus_numa.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: André Goddard Rosa Signed-off-by: Jiri Kosina Signed-off-by: Jesse Barnes --- arch/x86/pci/bus_numa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index 4ff126a3e887..730369f392a3 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -2,7 +2,7 @@ /* * sub bus (transparent) will use entres from 3 to store extra from - * root, so need to make sure we have enought slot there, Should we + * root, so need to make sure we have enough slot there, Should we * increase PCI_BUS_NUM_RESOURCES? */ #define RES_NUM 16 -- cgit v1.2.2 From 67f241f4579651ea4335b58967c8880c0a378249 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Nov 2009 22:27:40 -0800 Subject: x86/pci: separate x86_pci_rootbus_res_quirks from amd_bus.c Those functions are used by intel_bus.c, so separate them into another file and make amd_bus a bit smaller.
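[ Illustration, not from the patch: the move is the standard header/translation-unit split -- shared declarations in a header, one definition site in the new file, the new object added to the Makefile. A generic sketch with hypothetical names:

	/* shared.h: declarations visible to every user */
	extern int shared_count;
	void shared_helper(void);

	/* shared.c: the single definition site, listed once as obj-y */
	#include "shared.h"
	int shared_count;
	void shared_helper(void)
	{
		shared_count++;
	}

]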
Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/Makefile | 2 +- arch/x86/pci/amd_bus.c | 99 ----------------------------------------------- arch/x86/pci/bus_numa.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/pci/bus_numa.h | 1 + 4 files changed, 103 insertions(+), 100 deletions(-) create mode 100644 arch/x86/pci/bus_numa.c (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index d8a0a6279a4d..564b008a51c7 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o obj-y += amd_bus.o -obj-$(CONFIG_X86_64) += intel_bus.o +obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o ifeq ($(CONFIG_PCI_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 995f36096a42..95ecbd495955 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -6,8 +6,6 @@ #ifdef CONFIG_X86_64 #include -#include -#include #endif #include "bus_numa.h" @@ -19,54 +17,6 @@ #ifdef CONFIG_X86_64 -int pci_root_num; -struct pci_root_info pci_root_info[PCI_ROOT_NR]; -static int found_all_numa_early; - -void x86_pci_root_bus_res_quirks(struct pci_bus *b) -{ - int i; - int j; - struct pci_root_info *info; - - /* don't go for it if _CRS is used already */ - if (b->resource[0] != &ioport_resource || - b->resource[1] != &iomem_resource) - return; - - if (!pci_root_num) - return; - - /* for amd, if only one root bus, don't need to do anything */ - if (pci_root_num < 2 && found_all_numa_early) - return; - - for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == b->number) - break; - } - - if (i == pci_root_num) - return; - - printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", - b->number); - - info = &pci_root_info[i]; - for (j = 0; j < info->res_num; j++) { - struct resource *res; - struct resource *root; - - res = &info->res[j]; - b->resource[j] = res; - if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - root = &iomem_resource; - insert_resource(root, res); - } -} - #define RANGE_NUM 16 struct res_range { @@ -119,55 +69,6 @@ static void __init update_range(struct res_range *range, size_t start, } } -void __init update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge) -{ - int i; - struct resource *res; - - if (start > end) - return; - - if (!merge) - goto addit; - - /* try to merge it with old one */ - for (i = 0; i < info->res_num; i++) { - size_t final_start, final_end; - size_t common_start, common_end; - - res = &info->res[i]; - if (res->flags != flags) - continue; - - common_start = max((size_t)res->start, start); - common_end = min((size_t)res->end, end); - if (common_start > common_end + 1) - continue; - - final_start = min((size_t)res->start, start); - final_end = max((size_t)res->end, end); - - res->start = final_start; - res->end = final_end; - return; - } - -addit: - - /* need to add that */ - if (info->res_num >= RES_NUM) - return; - - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = start; - res->end = end; - res->child = NULL; - info->res_num++; -} - struct pci_hostbridge_probe { u32 bus; u32 slot; diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c new file mode 100644 index 000000000000..145df00e0387 --- /dev/null +++ b/arch/x86/pci/bus_numa.c @@ -0,0 +1,101 @@ +#include +#include + +#include "bus_numa.h" + +int pci_root_num; +struct pci_root_info 
pci_root_info[PCI_ROOT_NR]; +int found_all_numa_early; + +void x86_pci_root_bus_res_quirks(struct pci_bus *b) +{ + int i; + int j; + struct pci_root_info *info; + + /* don't go for it if _CRS is used already */ + if (b->resource[0] != &ioport_resource || + b->resource[1] != &iomem_resource) + return; + + if (!pci_root_num) + return; + + /* for amd, if only one root bus, don't need to do anything */ + if (pci_root_num < 2 && found_all_numa_early) + return; + + for (i = 0; i < pci_root_num; i++) { + if (pci_root_info[i].bus_min == b->number) + break; + } + + if (i == pci_root_num) + return; + + printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", + b->number); + + info = &pci_root_info[i]; + for (j = 0; j < info->res_num; j++) { + struct resource *res; + struct resource *root; + + res = &info->res[j]; + b->resource[j] = res; + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + else + root = &iomem_resource; + insert_resource(root, res); + } +} + +void __init update_res(struct pci_root_info *info, size_t start, + size_t end, unsigned long flags, int merge) +{ + int i; + struct resource *res; + + if (start > end) + return; + + if (!merge) + goto addit; + + /* try to merge it with old one */ + for (i = 0; i < info->res_num; i++) { + size_t final_start, final_end; + size_t common_start, common_end; + + res = &info->res[i]; + if (res->flags != flags) + continue; + + common_start = max((size_t)res->start, start); + common_end = min((size_t)res->end, end); + if (common_start > common_end + 1) + continue; + + final_start = min((size_t)res->start, start); + final_end = max((size_t)res->end, end); + + res->start = final_start; + res->end = final_end; + return; + } + +addit: + + /* need to add that */ + if (info->res_num >= RES_NUM) + return; + + res = &info->res[info->res_num]; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + res->child = NULL; + info->res_num++; +} diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index 730369f392a3..adbc23fe82ac 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -20,6 +20,7 @@ struct pci_root_info { #define PCI_ROOT_NR 4 extern int pci_root_num; extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; +extern int found_all_numa_early; extern void update_res(struct pci_root_info *info, size_t start, size_t end, unsigned long flags, int merge); -- cgit v1.2.2 From 5663b1b963183e98ece3e77e471da833bb5ad2ff Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:33:37 -0700 Subject: x86/PCI: MMCONFIG: remove unused definitions Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 02642773c29d..9bf04bcfb9c2 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -23,10 +23,6 @@ #define PREFIX "PCI: " -/* aperture is up to 256MB but BIOS may reserve less */ -#define MMCONFIG_APER_MIN (2 * 1024*1024) -#define MMCONFIG_APER_MAX (256 * 1024*1024) - /* Indicate if the mmcfg resources have been placed into the resource table. 
*/ static int __initdata pci_mmcfg_resources_inserted; -- cgit v1.2.2 From e823d6ff581c5d1d76aa8c73a202d7d1419d34b8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:33:42 -0700 Subject: x86/PCI: MMCONFIG: count MCFG structures with local variable Use a local variable, not pci_mmcfg_config_num, to count MCFG entries. No functional change, but simplifies future changes. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 9bf04bcfb9c2..fbadb89c71eb 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -555,7 +555,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) { struct acpi_table_mcfg *mcfg; unsigned long i; - int config_size; + int entries, config_size; if (!header) return -EINVAL; @@ -564,17 +564,18 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) /* how many config structures do we have */ pci_mmcfg_config_num = 0; + entries = 0; i = header->length - sizeof(struct acpi_table_mcfg); while (i >= sizeof(struct acpi_mcfg_allocation)) { - ++pci_mmcfg_config_num; + entries++; i -= sizeof(struct acpi_mcfg_allocation); }; - if (pci_mmcfg_config_num == 0) { + if (entries == 0) { printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); return -ENODEV; } - config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); + config_size = entries * sizeof(*pci_mmcfg_config); pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); if (!pci_mmcfg_config) { printk(KERN_WARNING PREFIX @@ -583,8 +584,9 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) } memcpy(pci_mmcfg_config, &mcfg[1], config_size); + pci_mmcfg_config_num = entries; - for (i = 0; i < pci_mmcfg_config_num; ++i) { + for (i = 0; i < entries; i++) { if (acpi_mcfg_check_entry(mcfg, &pci_mmcfg_config[i])) { kfree(pci_mmcfg_config); pci_mmcfg_config_num = 0; -- cgit v1.2.2 From d3578ef7aab5b9bb874d085609b3ed5d9abffc48 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:33:47 -0700 Subject: x86/PCI: MMCONFIG: step through MCFG table, not pci_mmcfg_config[] Step through the ACPI MCFG table, not pci_mmcfg_config[]. No functional change, but simplifies future patches that encapsulate pci_mmcfg_config[]. 
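[ Sketch of the access pattern, assuming only the ACPI layout shown in the diff below: MCFG entries are fixed-size struct acpi_mcfg_allocation records placed immediately after the struct acpi_table_mcfg header, so &mcfg[1] points at the first entry and the table can be walked by index:

	struct acpi_mcfg_allocation *cfg_table, *cfg;
	int i;

	cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
	for (i = 0; i < entries; i++) {
		cfg = &cfg_table[i];
		/* validate or copy one entry here */
	}

]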
Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index fbadb89c71eb..7a7b6ba3abbb 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -554,6 +554,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, static int __init pci_parse_mcfg(struct acpi_table_header *header) { struct acpi_table_mcfg *mcfg; + struct acpi_mcfg_allocation *cfg_table, *cfg; unsigned long i; int entries, config_size; @@ -586,8 +587,10 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) memcpy(pci_mmcfg_config, &mcfg[1], config_size); pci_mmcfg_config_num = entries; + cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1]; for (i = 0; i < entries; i++) { cfg = &cfg_table[i]; if (acpi_mcfg_check_entry(mcfg, cfg)) { kfree(pci_mmcfg_config); pci_mmcfg_config_num = 0; return -ENODEV; -- cgit v1.2.2 From 7da7d360ae025158d09aab18d66f5d2fe3c02252 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:33:53 -0700 Subject: x86/PCI: MMCONFIG: centralize MCFG structure management This patch encapsulates pci_mmcfg_config[] updates. All alloc/free is now done in pci_mmconfig_add() and free_all_mmcfg(), so all updates to pci_mmcfg_config[] and pci_mmcfg_config_num are in those two functions. This replaces the previous sequence of extend_mmcfg() and fill_one_mmcfg() with the single pci_mmconfig_add() interface. This interface is currently static but will eventually be used in the host bridge hot-add path. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 85 +++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 7a7b6ba3abbb..62a8ecd96980 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -26,14 +26,24 @@ /* Indicate if the mmcfg resources have been placed into the resource table.
*/ static int __initdata pci_mmcfg_resources_inserted; -static __init int extend_mmcfg(int num) +static __init void free_all_mmcfg(void) +{ + pci_mmcfg_arch_free(); + pci_mmcfg_config_num = 0; + kfree(pci_mmcfg_config); + pci_mmcfg_config = NULL; +} + +static __init struct acpi_mcfg_allocation *pci_mmconfig_add(int segment, + int start, int end, u64 addr) { struct acpi_mcfg_allocation *new; - int new_num = pci_mmcfg_config_num + num; + int new_num = pci_mmcfg_config_num + 1; + int i = pci_mmcfg_config_num; new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); if (!new) - return -1; + return NULL; if (pci_mmcfg_config) { memcpy(new, pci_mmcfg_config, @@ -42,18 +52,13 @@ static __init int extend_mmcfg(int num) } pci_mmcfg_config = new; - return 0; -} - -static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end) -{ - int i = pci_mmcfg_config_num; - pci_mmcfg_config_num++; pci_mmcfg_config[i].address = addr; pci_mmcfg_config[i].pci_segment = segment; pci_mmcfg_config[i].start_bus_number = start; pci_mmcfg_config[i].end_bus_number = end; + + return &pci_mmcfg_config[i]; } static const char __init *pci_mmcfg_e7520(void) @@ -65,11 +70,9 @@ static const char __init *pci_mmcfg_e7520(void) if (win == 0x0000 || win == 0xf000) return NULL; - if (extend_mmcfg(1) == -1) + if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL) return NULL; - fill_one_mmcfg(win << 16, 0, 0, 255); - return "Intel Corporation E7520 Memory Controller Hub"; } @@ -111,11 +114,9 @@ static const char __init *pci_mmcfg_intel_945(void) if ((pciexbar & mask) >= 0xf0000000U) return NULL; - if (extend_mmcfg(1) == -1) + if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL) return NULL; - fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1); - return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; } @@ -124,7 +125,7 @@ static const char __init *pci_mmcfg_amd_fam10h(void) u32 low, high, address; u64 base, msr; int i; - unsigned segnbits = 0, busnbits; + unsigned segnbits = 0, busnbits, end_bus; if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) return NULL; @@ -158,11 +159,13 @@ static const char __init *pci_mmcfg_amd_fam10h(void) busnbits = 8; } - if (extend_mmcfg(1 << segnbits) == -1) - return NULL; - + end_bus = (1 << busnbits) - 1; for (i = 0; i < (1 << segnbits); i++) - fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1); + if (pci_mmconfig_add(i, 0, end_bus, + base + (1<<28) * i) == NULL) { + free_all_mmcfg(); + return NULL; + } return "AMD Family 10h NB"; } @@ -210,16 +213,14 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void) if (!(extcfg & extcfg_enable_mask)) continue; - if (extend_mmcfg(1) == -1) - continue; - size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; base = extcfg & extcfg_base_mask[size_index]; /* base could > 4G */ base <<= extcfg_base_lshift; start = (extcfg & extcfg_start_mask) >> extcfg_start_shift; end = start + extcfg_sizebus[size_index] - 1; - fill_one_mmcfg(base, 0, start, end); + if (pci_mmconfig_add(0, start, end, base) == NULL) + continue; mcp55_mmconf_found++; } @@ -303,8 +304,7 @@ static int __init pci_mmcfg_check_hostbridge(void) if (!raw_pci_ops) return 0; - pci_mmcfg_config_num = 0; - pci_mmcfg_config = NULL; + free_all_mmcfg(); for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) { bus = pci_mmcfg_probes[i].bus; @@ -516,10 +516,7 @@ static void __init pci_mmcfg_reject_broken(int early) reject: printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); - pci_mmcfg_arch_free(); - kfree(pci_mmcfg_config); - pci_mmcfg_config = 
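[ Illustration, not part of the patch: the single test is safe because of an invariant kept by the two update paths -- pci_mmconfig_add() sets the pointer before raising the count, and free_all_mmcfg() clears both together -- so pci_mmcfg_config != NULL is implied whenever the count is non-zero:

	/* hypothetical helper expressing the invariant */
	static bool example_mmcfg_empty(void)
	{
		return pci_mmcfg_config_num == 0;
	}

]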
NULL; - pci_mmcfg_config_num = 0; + free_all_mmcfg(); } static int __initdata known_bridge; @@ -556,7 +553,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) struct acpi_table_mcfg *mcfg; struct acpi_mcfg_allocation *cfg_table, *cfg; unsigned long i; - int entries, config_size; + int entries; if (!header) return -EINVAL; @@ -564,7 +561,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) mcfg = (struct acpi_table_mcfg *)header; /* how many config structures do we have */ - pci_mmcfg_config_num = 0; + free_all_mmcfg(); entries = 0; i = header->length - sizeof(struct acpi_table_mcfg); while (i >= sizeof(struct acpi_mcfg_allocation)) { @@ -576,25 +573,21 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) return -ENODEV; } - config_size = entries * sizeof(*pci_mmcfg_config); - pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); - if (!pci_mmcfg_config) { - printk(KERN_WARNING PREFIX - "No memory for MCFG config tables\n"); - return -ENOMEM; - } - - memcpy(pci_mmcfg_config, &mcfg[1], config_size); - pci_mmcfg_config_num = entries; - cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1]; for (i = 0; i < entries; i++) { cfg = &cfg_table[i]; if (acpi_mcfg_check_entry(mcfg, cfg)) { - kfree(pci_mmcfg_config); - pci_mmcfg_config_num = 0; + free_all_mmcfg(); return -ENODEV; } + + if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number, + cfg->end_bus_number, cfg->address) == NULL) { + printk(KERN_WARNING PREFIX + "no memory for MCFG entries\n"); + free_all_mmcfg(); + return -ENOMEM; + } } return 0; -- cgit v1.2.2 From 463a5df175e3ceed684397ee2f8a3eb523d835a0 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:33:58 -0700 Subject: x86/PCI: MMCONFIG: simplify tests for empty pci_mmcfg_config table We never set pci_mmcfg_config unless we increment pci_mmcfg_config_num, so there's no need to test both pci_mmcfg_config_num and pci_mmcfg_config. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 62a8ecd96980..a0cc4d2efb8a 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -472,7 +472,6 @@ static void __init pci_mmcfg_reject_broken(int early) int i; if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || (pci_mmcfg_config[0].address == 0)) return; @@ -618,7 +617,6 @@ static void __init __pci_mmcfg_init(int early) pci_mmcfg_reject_broken(early); if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || (pci_mmcfg_config[0].address == 0)) return; @@ -652,7 +650,6 @@ static int __init pci_mmcfg_late_insert_resources(void) if ((pci_mmcfg_resources_inserted == 1) || (pci_probe & PCI_PROBE_MMCONF) == 0 || (pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || (pci_mmcfg_config[0].address == 0)) return 1; -- cgit v1.2.2 From f7ca69848786bb99fdfafb511791b078c298438e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:03 -0700 Subject: x86/PCI: MMCONFIG: reject MMCONFIG apertures at address zero Since all MMCONFIG regions go through pci_mmconfig_add(), we can test the address once there. If the caller supplies an address of zero, we never insert it in the pci_mmcfg_config[] table, so no need to test it elsewhere. 
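A condensed sketch of the control flow this gives pci_mmconfig_add(), simplified from the hunk below (the allocation and bookkeeping are elided):

	static __init struct acpi_mcfg_allocation *pci_mmconfig_add(int segment,
			int start, int end, u64 addr)
	{
		if (addr == 0)		/* reject apertures at address zero */
			return NULL;
		/* ... allocate and fill the new pci_mmcfg_config[] entry ... */
	}

Every host bridge probe and the MCFG parser go through this one constructor, which is what lets the three separate "address == 0" checks elsewhere be dropped.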
Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index a0cc4d2efb8a..067a2cfed15c 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -41,6 +41,9 @@ static __init struct acpi_mcfg_allocation *pci_mmconfig_add(int segment, int new_num = pci_mmcfg_config_num + 1; int i = pci_mmcfg_config_num; + if (addr == 0) + return NULL; + new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); if (!new) return NULL; @@ -471,8 +474,7 @@ static void __init pci_mmcfg_reject_broken(int early) typeof(pci_mmcfg_config[0]) *cfg; int i; - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config[0].address == 0)) + if (pci_mmcfg_config_num == 0) return; for (i = 0; i < pci_mmcfg_config_num; i++) { @@ -616,8 +618,7 @@ static void __init __pci_mmcfg_init(int early) pci_mmcfg_reject_broken(early); - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config[0].address == 0)) + if (pci_mmcfg_config_num == 0) return; if (pci_mmcfg_arch_init()) @@ -649,8 +650,7 @@ static int __init pci_mmcfg_late_insert_resources(void) */ if ((pci_mmcfg_resources_inserted == 1) || (pci_probe & PCI_PROBE_MMCONF) == 0 || - (pci_mmcfg_config_num == 0) || - (pci_mmcfg_config[0].address == 0)) + (pci_mmcfg_config_num == 0)) return 1; /* -- cgit v1.2.2 From df5eb1d67e8074dfbc23cf396c556116728187b3 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:08 -0700 Subject: x86/PCI: MMCONFIG: add PCI_MMCFG_BUS_OFFSET() to factor common expression This factors out the common "bus << 20" expression used when computing the MMCONFIG address. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 2 ++ arch/x86/pci/mmconfig-shared.c | 16 ++++++++-------- arch/x86/pci/mmconfig_32.c | 2 +- arch/x86/pci/mmconfig_64.c | 15 +++++++-------- 4 files changed, 18 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b399988eee3a..7d94a235ec82 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -124,6 +124,8 @@ extern void __init pci_mmcfg_arch_free(void); extern struct acpi_mcfg_allocation *pci_mmcfg_config; extern int pci_mmcfg_config_num; +#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20) + /* * AMD Fam10h CPUs are buggy, and cannot access MMIO config space * on their northbrige except through the * %eax register. 
As such, you MUST diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 067a2cfed15c..4820f0e8c594 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -355,8 +355,9 @@ static void __init pci_mmcfg_insert_resources(void) snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number); - res->start = cfg->address + (cfg->start_bus_number << 20); - res->end = res->start + (num_buses << 20) - 1; + res->start = cfg->address + + PCI_MMCFG_BUS_OFFSET(cfg->start_bus_number); + res->end = res->start + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; insert_resource(&iomem_resource, res); names += PCI_MMCFG_RESOURCE_NAME_LEN; @@ -478,15 +479,14 @@ static void __init pci_mmcfg_reject_broken(int early) return; for (i = 0; i < pci_mmcfg_config_num; i++) { - int valid = 0; + int num_buses, valid = 0; u64 addr, size; cfg = &pci_mmcfg_config[i]; - addr = cfg->start_bus_number; - addr <<= 20; - addr += cfg->address; - size = cfg->end_bus_number + 1 - cfg->start_bus_number; - size <<= 20; + addr = cfg->address + + PCI_MMCFG_BUS_OFFSET(cfg->start_bus_number); + num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; + size = PCI_MMCFG_BUS_OFFSET(num_buses); printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->pci_segment, diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index f10a7e94a84c..8c19df89ad75 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -47,7 +47,7 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) */ static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn) { - u32 dev_base = base | (bus << 20) | (devfn << 12); + u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12); int cpu = smp_processor_id(); if (dev_base != mmcfg_last_accessed_device || cpu != mmcfg_last_accessed_cpu) { diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 94349f8b2f96..8588711924cc 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -43,7 +43,7 @@ static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned i addr = get_virt(seg, bus); if (!addr) return NULL; - return addr + ((bus << 20) | (devfn << 12)); + return addr + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12)); } static int pci_mmcfg_read(unsigned int seg, unsigned int bus, @@ -113,17 +113,16 @@ static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) { void __iomem *addr; u64 start, size; + int num_buses; - start = cfg->start_bus_number; - start <<= 20; - start += cfg->address; - size = cfg->end_bus_number + 1 - cfg->start_bus_number; - size <<= 20; + start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus_number); + num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; + size = PCI_MMCFG_BUS_OFFSET(num_buses); addr = ioremap_nocache(start, size); if (addr) { printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", start, start + size - 1); - addr -= cfg->start_bus_number << 20; + addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus_number); } return addr; } @@ -162,7 +161,7 @@ void __init pci_mmcfg_arch_free(void) for (i = 0; i < pci_mmcfg_config_num; ++i) { if (pci_mmcfg_virt[i].virt) { - iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20)); + iounmap(pci_mmcfg_virt[i].virt + 
PCI_MMCFG_BUS_OFFSET(pci_mmcfg_virt[i].cfg->start_bus_number)); pci_mmcfg_virt[i].virt = NULL; pci_mmcfg_virt[i].cfg = NULL; } -- cgit v1.2.2 From d215a9c8b46e55a1d3bc1cd907c943ef95938a0e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:13 -0700 Subject: x86/PCI: MMCONFIG: use a private structure rather than the ACPI MCFG one This adds a struct pci_mmcfg_region with a little more information than the struct acpi_mcfg_allocation used previously. The acpi_mcfg structure is defined by the spec, so we can't change it. To begin with, struct pci_mmcfg_region is basically the same as the ACPI MCFG version, but future patches will add more information. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 9 ++++++++- arch/x86/pci/mmconfig-shared.c | 10 +++++----- arch/x86/pci/mmconfig_32.c | 2 +- arch/x86/pci/mmconfig_64.c | 6 +++--- 4 files changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 7d94a235ec82..3a2ca5f69521 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -118,10 +118,17 @@ extern int __init pcibios_init(void); /* pci-mmconfig.c */ +struct pci_mmcfg_region { + u64 address; + u16 pci_segment; + u8 start_bus_number; + u8 end_bus_number; +}; + extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); -extern struct acpi_mcfg_allocation *pci_mmcfg_config; +extern struct pci_mmcfg_region *pci_mmcfg_config; extern int pci_mmcfg_config_num; #define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 4820f0e8c594..5f7afdd1e2d6 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -34,10 +34,10 @@ static __init void free_all_mmcfg(void) pci_mmcfg_config = NULL; } -static __init struct acpi_mcfg_allocation *pci_mmconfig_add(int segment, - int start, int end, u64 addr) +static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, + int end, u64 addr) { - struct acpi_mcfg_allocation *new; + struct pci_mmcfg_region *new; int new_num = pci_mmcfg_config_num + 1; int i = pci_mmcfg_config_num; @@ -349,7 +349,7 @@ static void __init pci_mmcfg_insert_resources(void) names = (void *)&res[pci_mmcfg_config_num]; for (i = 0; i < pci_mmcfg_config_num; i++, res++) { - struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i]; + struct pci_mmcfg_region *cfg = &pci_mmcfg_config[i]; num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; res->name = names; snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, @@ -523,7 +523,7 @@ reject: static int __initdata known_bridge; /* The physical address of the MMCONFIG aperture. Set from ACPI tables. 
*/ -struct acpi_mcfg_allocation *pci_mmcfg_config; +struct pci_mmcfg_region *pci_mmcfg_config; int pci_mmcfg_config_num; static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 8c19df89ad75..3936eced993c 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -27,7 +27,7 @@ static int mmcfg_last_accessed_cpu; */ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) { - struct acpi_mcfg_allocation *cfg; + struct pci_mmcfg_region *cfg; int cfg_num; for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 8588711924cc..7a6231c3335e 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -14,14 +14,14 @@ /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { - struct acpi_mcfg_allocation *cfg; + struct pci_mmcfg_region *cfg; char __iomem *virt; }; static struct mmcfg_virt *pci_mmcfg_virt; static char __iomem *get_virt(unsigned int seg, unsigned bus) { - struct acpi_mcfg_allocation *cfg; + struct pci_mmcfg_region *cfg; int cfg_num; for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { @@ -109,7 +109,7 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) +static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) { void __iomem *addr; u64 start, size; -- cgit v1.2.2 From d7e6b66fe87c9f42480d73fc314aecaeae84ca6b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:18 -0700 Subject: x86/PCI: MMCONFIG: rename pci_mmcfg_region structure members This only renames the struct pci_mmcfg_region members; no functional change. 
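For quick reference, the structure reads as follows after the rename (old names from the diff below shown in comments):

	struct pci_mmcfg_region {
		u64 address;
		u16 segment;	/* was pci_segment */
		u8  start_bus;	/* was start_bus_number */
		u8  end_bus;	/* was end_bus_number */
	};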
Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 6 ++--- arch/x86/pci/mmconfig-shared.c | 50 +++++++++++++++++++++--------------------- arch/x86/pci/mmconfig_32.c | 6 ++--- arch/x86/pci/mmconfig_64.c | 16 +++++++------- 4 files changed, 39 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 3a2ca5f69521..a752d618f196 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -120,9 +120,9 @@ extern int __init pcibios_init(void); struct pci_mmcfg_region { u64 address; - u16 pci_segment; - u8 start_bus_number; - u8 end_bus_number; + u16 segment; + u8 start_bus; + u8 end_bus; }; extern int __init pci_mmcfg_arch_init(void); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 5f7afdd1e2d6..5479fbb2d6ab 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -57,9 +57,9 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, pci_mmcfg_config_num++; pci_mmcfg_config[i].address = addr; - pci_mmcfg_config[i].pci_segment = segment; - pci_mmcfg_config[i].start_bus_number = start; - pci_mmcfg_config[i].end_bus_number = end; + pci_mmcfg_config[i].segment = segment; + pci_mmcfg_config[i].start_bus = start; + pci_mmcfg_config[i].end_bus = end; return &pci_mmcfg_config[i]; } @@ -260,8 +260,8 @@ static int __init cmp_mmcfg(const void *x1, const void *x2) const typeof(pci_mmcfg_config[0]) *m2 = x2; int start1, start2; - start1 = m1->start_bus_number; - start2 = m2->start_bus_number; + start1 = m1->start_bus; + start2 = m2->start_bus; return start1 - start2; } @@ -279,8 +279,8 @@ static void __init pci_mmcfg_check_end_bus_number(void) if (pci_mmcfg_config_num > 0) { i = pci_mmcfg_config_num - 1; cfg = &pci_mmcfg_config[i]; - if (cfg->end_bus_number < cfg->start_bus_number) - cfg->end_bus_number = 255; + if (cfg->end_bus < cfg->start_bus) + cfg->end_bus = 255; } /* don't overlap please */ @@ -288,11 +288,11 @@ static void __init pci_mmcfg_check_end_bus_number(void) cfg = &pci_mmcfg_config[i]; cfgx = &pci_mmcfg_config[i+1]; - if (cfg->end_bus_number < cfg->start_bus_number) - cfg->end_bus_number = 255; + if (cfg->end_bus < cfg->start_bus) + cfg->end_bus = 255; - if (cfg->end_bus_number >= cfgx->start_bus_number) - cfg->end_bus_number = cfgx->start_bus_number - 1; + if (cfg->end_bus >= cfgx->start_bus) + cfg->end_bus = cfgx->start_bus - 1; } } @@ -350,13 +350,13 @@ static void __init pci_mmcfg_insert_resources(void) names = (void *)&res[pci_mmcfg_config_num]; for (i = 0; i < pci_mmcfg_config_num; i++, res++) { struct pci_mmcfg_region *cfg = &pci_mmcfg_config[i]; - num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; + num_buses = cfg->end_bus - cfg->start_bus + 1; res->name = names; snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, - "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment, - cfg->start_bus_number, cfg->end_bus_number); + "PCI MMCONFIG %u [%02x-%02x]", cfg->segment, + cfg->start_bus, cfg->end_bus); res->start = cfg->address + - PCI_MMCFG_BUS_OFFSET(cfg->start_bus_number); + PCI_MMCFG_BUS_OFFSET(cfg->start_bus); res->end = res->start + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; insert_resource(&iomem_resource, res); @@ -457,13 +457,13 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, valid = 1; if (old_size != size) { - /* update end_bus_number */ - cfg->end_bus_number = 
cfg->start_bus_number + ((size>>20) - 1); + /* update end_bus */ + cfg->end_bus = cfg->start_bus + ((size>>20) - 1); printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", - i, (unsigned long)cfg->address, cfg->pci_segment, - (unsigned int)cfg->start_bus_number, - (unsigned int)cfg->end_bus_number); + i, (unsigned long)cfg->address, cfg->segment, + (unsigned int)cfg->start_bus, + (unsigned int)cfg->end_bus); } } @@ -484,14 +484,14 @@ static void __init pci_mmcfg_reject_broken(int early) cfg = &pci_mmcfg_config[i]; addr = cfg->address + - PCI_MMCFG_BUS_OFFSET(cfg->start_bus_number); - num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; + PCI_MMCFG_BUS_OFFSET(cfg->start_bus); + num_buses = cfg->end_bus - cfg->start_bus + 1; size = PCI_MMCFG_BUS_OFFSET(num_buses); printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", - i, (unsigned long)cfg->address, cfg->pci_segment, - (unsigned int)cfg->start_bus_number, - (unsigned int)cfg->end_bus_number); + i, (unsigned long)cfg->address, cfg->segment, + (unsigned int)cfg->start_bus, + (unsigned int)cfg->end_bus); if (!early && !acpi_disabled) valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 3936eced993c..a3cee532c935 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -32,9 +32,9 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = &pci_mmcfg_config[cfg_num]; - if (cfg->pci_segment == seg && - (cfg->start_bus_number <= bus) && - (cfg->end_bus_number >= bus)) + if (cfg->segment == seg && + (cfg->start_bus <= bus) && + (cfg->end_bus >= bus)) return cfg->address; } diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 7a6231c3335e..fdf08f97131b 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -26,9 +26,9 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = pci_mmcfg_virt[cfg_num].cfg; - if (cfg->pci_segment == seg && - (cfg->start_bus_number <= bus) && - (cfg->end_bus_number >= bus)) + if (cfg->segment == seg && + (cfg->start_bus <= bus) && + (cfg->end_bus >= bus)) return pci_mmcfg_virt[cfg_num].virt; } @@ -115,14 +115,14 @@ static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) u64 start, size; int num_buses; - start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus_number); - num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; + start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus); + num_buses = cfg->end_bus - cfg->start_bus + 1; size = PCI_MMCFG_BUS_OFFSET(num_buses); addr = ioremap_nocache(start, size); if (addr) { printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", start, start + size - 1); - addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus_number); + addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus); } return addr; } @@ -143,7 +143,7 @@ int __init pci_mmcfg_arch_init(void) if (!pci_mmcfg_virt[i].virt) { printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " "segment %d\n", - pci_mmcfg_config[i].pci_segment); + pci_mmcfg_config[i].segment); pci_mmcfg_arch_free(); return 0; } @@ -161,7 +161,7 @@ void __init pci_mmcfg_arch_free(void) for (i = 0; i < pci_mmcfg_config_num; ++i) { if (pci_mmcfg_virt[i].virt) { - iounmap(pci_mmcfg_virt[i].virt + PCI_MMCFG_BUS_OFFSET(pci_mmcfg_virt[i].cfg->start_bus_number)); + 
iounmap(pci_mmcfg_virt[i].virt + PCI_MMCFG_BUS_OFFSET(pci_mmcfg_virt[i].cfg->start_bus)); pci_mmcfg_virt[i].virt = NULL; pci_mmcfg_virt[i].cfg = NULL; } -- cgit v1.2.2 From 95cf1cf0c5a767feb811dfed298b95b1df8824c7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:24 -0700 Subject: x86/PCI: MMCONFIG: use pointer to simplify pci_mmcfg_config[] structure access No functional change, but simplifies a future patch to convert the table to a list. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 5479fbb2d6ab..28ac9f58a986 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -54,12 +54,14 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, kfree(pci_mmcfg_config); } pci_mmcfg_config = new; - pci_mmcfg_config_num++; - pci_mmcfg_config[i].address = addr; - pci_mmcfg_config[i].segment = segment; - pci_mmcfg_config[i].start_bus = start; - pci_mmcfg_config[i].end_bus = end; + + new = &pci_mmcfg_config[i]; + + new->address = addr; + new->segment = segment; + new->start_bus = start; + new->end_bus = end; return &pci_mmcfg_config[i]; } -- cgit v1.2.2 From 56ddf4d3cf04e80254d3d721c6bea2f8ec44c41a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:29 -0700 Subject: x86/PCI: MMCONFIG: add resource to struct pci_mmcfg_region This patch adds a resource and corresponding name to the MMCONFIG structure. This makes allocation simpler (we can allocate the resource and name at the same time we allocate the pci_mmcfg_region), and gives us a way to hang onto the resource after inserting it. This will be needed so we can release and free it when hot-removing a host bridge. 
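A sketch of how pci_mmconfig_add() now fills in the embedded resource, mirroring the hunk below. The name-length constant breaks down as: 22 for the literal characters of "PCI MMCONFIG  [bus -]" plus the terminating NUL, and 4 + 2 + 2 for the formatted segment and bus numbers:

	num_buses  = end - start + 1;
	res        = &new->res;
	res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
	res->end   = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
		 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
	res->name  = new->name;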
Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 5 ++++ arch/x86/pci/mmconfig-shared.c | 64 ++++++++++++++++++++++-------------------- 2 files changed, 38 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index a752d618f196..a6d42c10b017 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -118,11 +118,16 @@ extern int __init pcibios_init(void); /* pci-mmconfig.c */ +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */ +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2) + struct pci_mmcfg_region { + struct resource res; u64 address; u16 segment; u8 start_bus; u8 end_bus; + char name[PCI_MMCFG_RESOURCE_NAME_LEN]; }; extern int __init pci_mmcfg_arch_init(void); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 28ac9f58a986..ba3aa3697418 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -28,7 +28,15 @@ static int __initdata pci_mmcfg_resources_inserted; static __init void free_all_mmcfg(void) { + int i; + struct pci_mmcfg_region *cfg; + pci_mmcfg_arch_free(); + for (i = 0; i < pci_mmcfg_config_num; i++) { + cfg = &pci_mmcfg_config[i]; + if (cfg->res.parent) + release_resource(&cfg->res); + } pci_mmcfg_config_num = 0; kfree(pci_mmcfg_config); pci_mmcfg_config = NULL; @@ -40,6 +48,8 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, struct pci_mmcfg_region *new; int new_num = pci_mmcfg_config_num + 1; int i = pci_mmcfg_config_num; + int num_buses; + struct resource *res; if (addr == 0) return NULL; @@ -63,6 +73,15 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, new->start_bus = start; new->end_bus = end; + num_buses = end - start + 1; + res = &new->res; + res->start = addr + PCI_MMCFG_BUS_OFFSET(start); + res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, + "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); + res->name = new->name; + return &pci_mmcfg_config[i]; } @@ -336,33 +355,12 @@ static int __init pci_mmcfg_check_hostbridge(void) static void __init pci_mmcfg_insert_resources(void) { -#define PCI_MMCFG_RESOURCE_NAME_LEN 24 int i; - struct resource *res; - char *names; - unsigned num_buses; - - res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), - pci_mmcfg_config_num, GFP_KERNEL); - if (!res) { - printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); - return; - } + struct pci_mmcfg_region *cfg; - names = (void *)&res[pci_mmcfg_config_num]; - for (i = 0; i < pci_mmcfg_config_num; i++, res++) { - struct pci_mmcfg_region *cfg = &pci_mmcfg_config[i]; - num_buses = cfg->end_bus - cfg->start_bus + 1; - res->name = names; - snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, - "PCI MMCONFIG %u [%02x-%02x]", cfg->segment, - cfg->start_bus, cfg->end_bus); - res->start = cfg->address + - PCI_MMCFG_BUS_OFFSET(cfg->start_bus); - res->end = res->start + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); - names += PCI_MMCFG_RESOURCE_NAME_LEN; + for (i = 0; i < pci_mmcfg_config_num; i++) { + cfg = &pci_mmcfg_config[i]; + insert_resource(&iomem_resource, &cfg->res); } /* Mark that the resources have been inserted. 
*/ @@ -444,7 +442,7 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, typeof(pci_mmcfg_config[0]) *cfg, int with_e820) { u64 old_size = size; - int valid = 0; + int valid = 0, num_buses; while (!is_reserved(addr, addr + size, E820_RESERVED)) { size >>= 1; @@ -461,6 +459,12 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, if (old_size != size) { /* update end_bus */ cfg->end_bus = cfg->start_bus + ((size>>20) - 1); + num_buses = cfg->end_bus - cfg->start_bus + 1; + cfg->res.end = cfg->res.start + + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, + "PCI MMCONFIG %04x [bus %02x-%02x]", + cfg->segment, cfg->start_bus, cfg->end_bus); printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->segment, @@ -481,14 +485,12 @@ static void __init pci_mmcfg_reject_broken(int early) return; for (i = 0; i < pci_mmcfg_config_num; i++) { - int num_buses, valid = 0; + int valid = 0; u64 addr, size; cfg = &pci_mmcfg_config[i]; - addr = cfg->address + - PCI_MMCFG_BUS_OFFSET(cfg->start_bus); - num_buses = cfg->end_bus - cfg->start_bus + 1; - size = PCI_MMCFG_BUS_OFFSET(num_buses); + addr = cfg->res.start; + size = resource_size(&cfg->res); printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->segment, -- cgit v1.2.2 From 2f2a8b9c90279e75f87aaf322a948bdced27e89f Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:34 -0700 Subject: x86/PCI: MMCONFIG: trivial is_mmconf_reserved() interface simplification Since pci_mmcfg_region contains the struct resource, no need to pass the pci_mmcfg_region *and* the resource start/size. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index ba3aa3697418..90422b4a7c91 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -438,9 +438,10 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); static int __init is_mmconf_reserved(check_reserved_t is_reserved, - u64 addr, u64 size, int i, - typeof(pci_mmcfg_config[0]) *cfg, int with_e820) + int i, typeof(pci_mmcfg_config[0]) *cfg, int with_e820) { + u64 addr = cfg->res.start; + u64 size = resource_size(&cfg->res); u64 old_size = size; int valid = 0, num_buses; @@ -486,11 +487,8 @@ static void __init pci_mmcfg_reject_broken(int early) for (i = 0; i < pci_mmcfg_config_num; i++) { int valid = 0; - u64 addr, size; cfg = &pci_mmcfg_config[i]; - addr = cfg->res.start; - size = resource_size(&cfg->res); printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->segment, @@ -498,7 +496,7 @@ static void __init pci_mmcfg_reject_broken(int early) (unsigned int)cfg->end_bus); if (!early && !acpi_disabled) - valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); + valid = is_mmconf_reserved(is_acpi_reserved, i, cfg, 0); if (valid) continue; @@ -511,7 +509,7 @@ static void __init pci_mmcfg_reject_broken(int early) /* Don't try to do this check unless configuration type 1 is available. 
how about type 2 ?*/ if (raw_pci_ops) - valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); + valid = is_mmconf_reserved(e820_all_mapped, i, cfg, 1); if (!valid) goto reject; -- cgit v1.2.2 From 3f0f5503926f7447615f083c2d57545a83b6357c Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:39 -0700 Subject: x86/PCI: MMCONFIG: add virtual address to struct pci_mmcfg_region The virtual address is only used for x86_64, but it's so much simpler to manage it as part of the pci_mmcfg_region that I think it's worth wasting a pointer per MMCONFIG region on x86_32. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/mmconfig_64.c | 45 +++++++++++++----------------------------- 2 files changed, 15 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index a6d42c10b017..7aa2ed8f25ab 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -124,6 +124,7 @@ extern int __init pcibios_init(void); struct pci_mmcfg_region { struct resource res; u64 address; + char __iomem *virt; u16 segment; u8 start_bus; u8 end_bus; diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index fdf08f97131b..78fa05c6c04d 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -12,24 +12,17 @@ #include #include -/* Static virtual mapping of the MMCONFIG aperture */ -struct mmcfg_virt { - struct pci_mmcfg_region *cfg; - char __iomem *virt; -}; -static struct mmcfg_virt *pci_mmcfg_virt; - static char __iomem *get_virt(unsigned int seg, unsigned bus) { + int i; struct pci_mmcfg_region *cfg; - int cfg_num; - for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { - cfg = pci_mmcfg_virt[cfg_num].cfg; + for (i = 0; i < pci_mmcfg_config_num; ++i) { + cfg = &pci_mmcfg_config[i]; if (cfg->segment == seg && (cfg->start_bus <= bus) && (cfg->end_bus >= bus)) - return pci_mmcfg_virt[cfg_num].virt; + return cfg->virt; } /* Fall back to type 0 */ @@ -130,20 +123,15 @@ static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) int __init pci_mmcfg_arch_init(void) { int i; - pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) * - pci_mmcfg_config_num, GFP_KERNEL); - if (pci_mmcfg_virt == NULL) { - printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); - return 0; - } + struct pci_mmcfg_region *cfg; for (i = 0; i < pci_mmcfg_config_num; ++i) { - pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; - pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); - if (!pci_mmcfg_virt[i].virt) { + cfg = &pci_mmcfg_config[i]; + cfg->virt = mcfg_ioremap(cfg); + if (!cfg->virt) { printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " "segment %d\n", - pci_mmcfg_config[i].segment); + cfg->segment); pci_mmcfg_arch_free(); return 0; } @@ -155,18 +143,13 @@ int __init pci_mmcfg_arch_init(void) void __init pci_mmcfg_arch_free(void) { int i; - - if (pci_mmcfg_virt == NULL) - return; + struct pci_mmcfg_region *cfg; for (i = 0; i < pci_mmcfg_config_num; ++i) { - if (pci_mmcfg_virt[i].virt) { - iounmap(pci_mmcfg_virt[i].virt + PCI_MMCFG_BUS_OFFSET(pci_mmcfg_virt[i].cfg->start_bus)); - pci_mmcfg_virt[i].virt = NULL; - pci_mmcfg_virt[i].cfg = NULL; + cfg = &pci_mmcfg_config[i]; + if (cfg->virt) { + iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); + cfg->virt = NULL; } } - - kfree(pci_mmcfg_virt); - pci_mmcfg_virt = NULL; } -- cgit v1.2.2 From 
987c367b4e93be6826394e7c9cc14d28bb5c8810 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:44 -0700 Subject: x86/PCI: MMCONFIG: remove typeof so we can use a list This replaces "typeof(pci_mmcfg_config[0])" with the actual type because I plan to convert pci_mmcfg_config to a list, and then "pci_mmcfg_config[0]" won't mean anything. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 90422b4a7c91..6eeeac0d25f4 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -277,8 +277,8 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { static int __init cmp_mmcfg(const void *x1, const void *x2) { - const typeof(pci_mmcfg_config[0]) *m1 = x1; - const typeof(pci_mmcfg_config[0]) *m2 = x2; + const struct pci_mmcfg_region *m1 = x1; + const struct pci_mmcfg_region *m2 = x2; int start1, start2; start1 = m1->start_bus; @@ -290,7 +290,7 @@ static int __init cmp_mmcfg(const void *x1, const void *x2) static void __init pci_mmcfg_check_end_bus_number(void) { int i; - typeof(pci_mmcfg_config[0]) *cfg, *cfgx; + struct pci_mmcfg_region *cfg, *cfgx; /* sort them at first */ sort(pci_mmcfg_config, pci_mmcfg_config_num, @@ -438,7 +438,7 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); static int __init is_mmconf_reserved(check_reserved_t is_reserved, - int i, typeof(pci_mmcfg_config[0]) *cfg, int with_e820) + int i, struct pci_mmcfg_region *cfg, int with_e820) { u64 addr = cfg->res.start; u64 size = resource_size(&cfg->res); @@ -479,7 +479,7 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, static void __init pci_mmcfg_reject_broken(int early) { - typeof(pci_mmcfg_config[0]) *cfg; + struct pci_mmcfg_region *cfg; int i; if (pci_mmcfg_config_num == 0) -- cgit v1.2.2 From ff097ddd4aeac790fd51d013c79c2f18ec9a7117 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:49 -0700 Subject: x86/PCI: MMCONFIG: manage pci_mmcfg_region as a list, not a table This changes pci_mmcfg_region from a table to a list, to make it easier to add and remove MMCONFIG regions for PCI host bridge hotplug. 
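The heart of the conversion is a sorted insertion keyed on (segment, start_bus); condensed from the helper added below:

	static __init void list_add_sorted(struct pci_mmcfg_region *new)
	{
		struct pci_mmcfg_region *cfg;

		/* keep pci_mmcfg_list sorted by segment and starting bus number */
		list_for_each_entry(cfg, &pci_mmcfg_list, list) {
			if (cfg->segment > new->segment ||
			    (cfg->segment == new->segment &&
			     cfg->start_bus >= new->start_bus)) {
				list_add_tail(&new->list, &cfg->list);
				return;
			}
		}
		list_add_tail(&new->list, &pci_mmcfg_list);
	}

list_add_tail() on an element's own list node links the new entry just before that element, so the first region that sorts after "new" becomes its successor; this also makes the explicit sort() pass of the table version unnecessary.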
Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 4 +- arch/x86/pci/mmconfig-shared.c | 106 ++++++++++++++++------------------------- arch/x86/pci/mmconfig_32.c | 5 +- arch/x86/pci/mmconfig_64.c | 13 ++--- 4 files changed, 47 insertions(+), 81 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 7aa2ed8f25ab..0b7c316a70c3 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -122,6 +122,7 @@ extern int __init pcibios_init(void); #define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2) struct pci_mmcfg_region { + struct list_head list; struct resource res; u64 address; char __iomem *virt; @@ -134,8 +135,7 @@ struct pci_mmcfg_region { extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); -extern struct pci_mmcfg_region *pci_mmcfg_config; -extern int pci_mmcfg_config_num; +extern struct list_head pci_mmcfg_list; #define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 6eeeac0d25f4..2709aa81801d 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -26,53 +25,58 @@ /* Indicate if the mmcfg resources have been placed into the resource table. */ static int __initdata pci_mmcfg_resources_inserted; +LIST_HEAD(pci_mmcfg_list); + static __init void free_all_mmcfg(void) { - int i; - struct pci_mmcfg_region *cfg; + struct pci_mmcfg_region *cfg, *tmp; pci_mmcfg_arch_free(); - for (i = 0; i < pci_mmcfg_config_num; i++) { - cfg = &pci_mmcfg_config[i]; + list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list) { if (cfg->res.parent) release_resource(&cfg->res); + list_del(&cfg->list); + kfree(cfg); } - pci_mmcfg_config_num = 0; - kfree(pci_mmcfg_config); - pci_mmcfg_config = NULL; +} + +static __init void list_add_sorted(struct pci_mmcfg_region *new) +{ + struct pci_mmcfg_region *cfg; + + /* keep list sorted by segment and starting bus number */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->segment > new->segment || + (cfg->segment == new->segment && + cfg->start_bus >= new->start_bus)) { + list_add_tail(&new->list, &cfg->list); + return; + } + } + list_add_tail(&new->list, &pci_mmcfg_list); } static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, int end, u64 addr) { struct pci_mmcfg_region *new; - int new_num = pci_mmcfg_config_num + 1; - int i = pci_mmcfg_config_num; int num_buses; struct resource *res; if (addr == 0) return NULL; - new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); + new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; - if (pci_mmcfg_config) { - memcpy(new, pci_mmcfg_config, - sizeof(pci_mmcfg_config[0]) * new_num); - kfree(pci_mmcfg_config); - } - pci_mmcfg_config = new; - pci_mmcfg_config_num++; - - new = &pci_mmcfg_config[i]; - new->address = addr; new->segment = segment; new->start_bus = start; new->end_bus = end; + list_add_sorted(new); + num_buses = end - start + 1; res = &new->res; res->start = addr + PCI_MMCFG_BUS_OFFSET(start); @@ -82,7 +86,7 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); res->name = new->name; - return &pci_mmcfg_config[i]; + return new; } static const char __init *pci_mmcfg_e7520(void) @@ -214,7 +218,7 @@ static const 
char __init *pci_mmcfg_nvidia_mcp55(void) /* * do check if amd fam10h already took over */ - if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) + if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked) return NULL; mcp55_checked = true; @@ -275,44 +279,26 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { 0x0369, pci_mmcfg_nvidia_mcp55 }, }; -static int __init cmp_mmcfg(const void *x1, const void *x2) -{ - const struct pci_mmcfg_region *m1 = x1; - const struct pci_mmcfg_region *m2 = x2; - int start1, start2; - - start1 = m1->start_bus; - start2 = m2->start_bus; - - return start1 - start2; -} - static void __init pci_mmcfg_check_end_bus_number(void) { - int i; struct pci_mmcfg_region *cfg, *cfgx; - /* sort them at first */ - sort(pci_mmcfg_config, pci_mmcfg_config_num, - sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL); - /* last one*/ - if (pci_mmcfg_config_num > 0) { - i = pci_mmcfg_config_num - 1; - cfg = &pci_mmcfg_config[i]; + cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list); + if (cfg) if (cfg->end_bus < cfg->start_bus) cfg->end_bus = 255; - } - /* don't overlap please */ - for (i = 0; i < pci_mmcfg_config_num - 1; i++) { - cfg = &pci_mmcfg_config[i]; - cfgx = &pci_mmcfg_config[i+1]; + if (list_is_singular(&pci_mmcfg_list)) + return; + /* don't overlap please */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { if (cfg->end_bus < cfg->start_bus) cfg->end_bus = 255; - if (cfg->end_bus >= cfgx->start_bus) + cfgx = list_entry(cfg->list.next, typeof(*cfg), list); + if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus) cfg->end_bus = cfgx->start_bus - 1; } } @@ -350,18 +336,15 @@ static int __init pci_mmcfg_check_hostbridge(void) /* some end_bus_number is crazy, fix it */ pci_mmcfg_check_end_bus_number(); - return pci_mmcfg_config_num != 0; + return !list_empty(&pci_mmcfg_list); } static void __init pci_mmcfg_insert_resources(void) { - int i; struct pci_mmcfg_region *cfg; - for (i = 0; i < pci_mmcfg_config_num; i++) { - cfg = &pci_mmcfg_config[i]; + list_for_each_entry(cfg, &pci_mmcfg_list, list) insert_resource(&iomem_resource, &cfg->res); - } /* Mark that the resources have been inserted. */ pci_mmcfg_resources_inserted = 1; @@ -482,18 +465,15 @@ static void __init pci_mmcfg_reject_broken(int early) struct pci_mmcfg_region *cfg; int i; - if (pci_mmcfg_config_num == 0) - return; - - for (i = 0; i < pci_mmcfg_config_num; i++) { + list_for_each_entry(cfg, &pci_mmcfg_list, list) { int valid = 0; - cfg = &pci_mmcfg_config[i]; printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->segment, (unsigned int)cfg->start_bus, (unsigned int)cfg->end_bus); + i++; if (!early && !acpi_disabled) valid = is_mmconf_reserved(is_acpi_reserved, i, cfg, 0); @@ -524,10 +504,6 @@ reject: static int __initdata known_bridge; -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. 
*/ -struct pci_mmcfg_region *pci_mmcfg_config; -int pci_mmcfg_config_num; - static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, struct acpi_mcfg_allocation *cfg) { @@ -620,7 +596,7 @@ static void __init __pci_mmcfg_init(int early) pci_mmcfg_reject_broken(early); - if (pci_mmcfg_config_num == 0) + if (list_empty(&pci_mmcfg_list)) return; if (pci_mmcfg_arch_init()) @@ -652,7 +628,7 @@ static int __init pci_mmcfg_late_insert_resources(void) */ if ((pci_mmcfg_resources_inserted == 1) || (pci_probe & PCI_PROBE_MMCONF) == 0 || - (pci_mmcfg_config_num == 0)) + list_empty(&pci_mmcfg_list)) return 1; /* diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index a3cee532c935..c04523e09649 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -28,15 +28,12 @@ static int mmcfg_last_accessed_cpu; static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) { struct pci_mmcfg_region *cfg; - int cfg_num; - for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { - cfg = &pci_mmcfg_config[cfg_num]; + list_for_each_entry(cfg, &pci_mmcfg_list, list) if (cfg->segment == seg && (cfg->start_bus <= bus) && (cfg->end_bus >= bus)) return cfg->address; - } /* Fall back to type 0 */ return 0; diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 78fa05c6c04d..ed1f479b4d0e 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -14,16 +14,13 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) { - int i; struct pci_mmcfg_region *cfg; - for (i = 0; i < pci_mmcfg_config_num; ++i) { - cfg = &pci_mmcfg_config[i]; + list_for_each_entry(cfg, &pci_mmcfg_list, list) if (cfg->segment == seg && (cfg->start_bus <= bus) && (cfg->end_bus >= bus)) return cfg->virt; - } /* Fall back to type 0 */ return NULL; @@ -122,11 +119,9 @@ static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) int __init pci_mmcfg_arch_init(void) { - int i; struct pci_mmcfg_region *cfg; - for (i = 0; i < pci_mmcfg_config_num; ++i) { - cfg = &pci_mmcfg_config[i]; + list_for_each_entry(cfg, &pci_mmcfg_list, list) { cfg->virt = mcfg_ioremap(cfg); if (!cfg->virt) { printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " @@ -142,11 +137,9 @@ int __init pci_mmcfg_arch_init(void) void __init pci_mmcfg_arch_free(void) { - int i; struct pci_mmcfg_region *cfg; - for (i = 0; i < pci_mmcfg_config_num; ++i) { - cfg = &pci_mmcfg_config[i]; + list_for_each_entry(cfg, &pci_mmcfg_list, list) { if (cfg->virt) { iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); cfg->virt = NULL; -- cgit v1.2.2 From ba2afbabfc44d6322e8607c004f37868ff786cf8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:54 -0700 Subject: x86/PCI: MMCONFIG: add pci_mmconfig_remove() to remove MMCONFIG region This is only used internally now, but eventually will be used in the hot-remove path to remove the MMCONFIG region associated with a host bridge. 
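The removal path is short because each region now carries its own resource and list linkage; a sketch matching the hunk below:

	static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
	{
		if (cfg->res.parent)	/* release only if it was inserted */
			release_resource(&cfg->res);
		list_del(&cfg->list);
		kfree(cfg);
	}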
Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 2709aa81801d..392f8fe16955 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -27,17 +27,21 @@ static int __initdata pci_mmcfg_resources_inserted; LIST_HEAD(pci_mmcfg_list); +static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg) +{ + if (cfg->res.parent) + release_resource(&cfg->res); + list_del(&cfg->list); + kfree(cfg); +} + static __init void free_all_mmcfg(void) { struct pci_mmcfg_region *cfg, *tmp; pci_mmcfg_arch_free(); - list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list) { - if (cfg->res.parent) - release_resource(&cfg->res); - list_del(&cfg->list); - kfree(cfg); - } + list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list) + pci_mmconfig_remove(cfg); } static __init void list_add_sorted(struct pci_mmcfg_region *new) -- cgit v1.2.2 From 8c57786ad3d921713c7ad8e44132aa537a1d0fec Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:34:59 -0700 Subject: x86/PCI: MMCONFIG: clean up printks No functional change; just tidy up printks and make them more consistent with the rest of PCI. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 46 +++++++++++++++++++----------------------- arch/x86/pci/mmconfig_64.c | 12 +++++------ 2 files changed, 26 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 392f8fe16955..71d69b88fa33 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -90,6 +90,10 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); res->name = new->name; + printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at " + "%pR (base %#lx)\n", segment, start, end, &new->res, + (unsigned long) addr); + return new; } @@ -333,7 +337,7 @@ static int __init pci_mmcfg_check_hostbridge(void) name = pci_mmcfg_probes[i].probe(); if (name) - printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", + printk(KERN_INFO PREFIX "%s with MMCONFIG support\n", name); } @@ -425,7 +429,7 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); static int __init is_mmconf_reserved(check_reserved_t is_reserved, - int i, struct pci_mmcfg_region *cfg, int with_e820) + struct pci_mmcfg_region *cfg, int with_e820) { u64 addr = cfg->res.start; u64 size = resource_size(&cfg->res); @@ -439,9 +443,9 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, } if (size >= (16UL<<20) || size == old_size) { - printk(KERN_NOTICE - "PCI: MCFG area at %Lx reserved in %s\n", - addr, with_e820?"E820":"ACPI motherboard resources"); + printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n", + &cfg->res, + with_e820 ? 
"E820" : "ACPI motherboard resources"); valid = 1; if (old_size != size) { @@ -453,11 +457,11 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %04x [bus %02x-%02x]", cfg->segment, cfg->start_bus, cfg->end_bus); - printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " - "segment %hu buses %u - %u\n", - i, (unsigned long)cfg->address, cfg->segment, - (unsigned int)cfg->start_bus, - (unsigned int)cfg->end_bus); + printk(KERN_INFO PREFIX + "MMCONFIG for %04x [bus%02x-%02x] " + "at %pR (base %#lx) (size reduced!)\n", + cfg->segment, cfg->start_bus, cfg->end_bus, + &cfg->res, (unsigned long) cfg->address); } } @@ -467,33 +471,25 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, static void __init pci_mmcfg_reject_broken(int early) { struct pci_mmcfg_region *cfg; - int i; list_for_each_entry(cfg, &pci_mmcfg_list, list) { int valid = 0; - printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " - "segment %hu buses %u - %u\n", - i, (unsigned long)cfg->address, cfg->segment, - (unsigned int)cfg->start_bus, - (unsigned int)cfg->end_bus); - i++; - if (!early && !acpi_disabled) - valid = is_mmconf_reserved(is_acpi_reserved, i, cfg, 0); + valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); if (valid) continue; if (!early) - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" - " reserved in ACPI motherboard resources\n", - cfg->address); + printk(KERN_ERR FW_BUG PREFIX + "MMCONFIG at %pR not reserved in " + "ACPI motherboard resources\n", &cfg->res); /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ if (raw_pci_ops) - valid = is_mmconf_reserved(e820_all_mapped, i, cfg, 1); + valid = is_mmconf_reserved(e820_all_mapped, cfg, 1); if (!valid) goto reject; @@ -502,7 +498,7 @@ static void __init pci_mmcfg_reject_broken(int early) return; reject: - printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); + printk(KERN_INFO PREFIX "not using MMCONFIG\n"); free_all_mmcfg(); } @@ -525,7 +521,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, return 0; } - printk(KERN_ERR PREFIX "MCFG region for %04x:%02x-%02x at %#llx " + printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " "is above 4GB, ignored\n", cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number, cfg->address); return -EINVAL; diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index ed1f479b4d0e..cfa6cdb6d262 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -12,6 +12,8 @@ #include #include +#define PREFIX "PCI: " + static char __iomem *get_virt(unsigned int seg, unsigned bus) { struct pci_mmcfg_region *cfg; @@ -109,11 +111,8 @@ static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) num_buses = cfg->end_bus - cfg->start_bus + 1; size = PCI_MMCFG_BUS_OFFSET(num_buses); addr = ioremap_nocache(start, size); - if (addr) { - printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", - start, start + size - 1); + if (addr) addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus); - } return addr; } @@ -124,9 +123,8 @@ int __init pci_mmcfg_arch_init(void) list_for_each_entry(cfg, &pci_mmcfg_list, list) { cfg->virt = mcfg_ioremap(cfg); if (!cfg->virt) { - printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " - "segment %d\n", - cfg->segment); + printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n", + &cfg->res); pci_mmcfg_arch_free(); return 0; } -- cgit v1.2.2 From f6e1d8cc38b3776038fb15d3acc82ed8bb552f82 Mon 
Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Nov 2009 17:35:04 -0700 Subject: x86/PCI: MMCONFIG: add lookup function This patch factors out the search for an MMCONFIG region, which was previously implemented in both mmconfig_32 and mmconfig_64. No functional change. Reviewed-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/mmconfig-shared.c | 12 ++++++++++++ arch/x86/pci/mmconfig_32.c | 11 +++-------- arch/x86/pci/mmconfig_64.c | 23 ++++------------------- 4 files changed, 20 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 0b7c316a70c3..b4bf9a942ed0 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -134,6 +134,7 @@ struct pci_mmcfg_region { extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); extern struct list_head pci_mmcfg_list; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 71d69b88fa33..b19d1e54201e 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -97,6 +97,18 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, return new; } +struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) +{ + struct pci_mmcfg_region *cfg; + + list_for_each_entry(cfg, &pci_mmcfg_list, list) + if (cfg->segment == segment && + cfg->start_bus <= bus && bus <= cfg->end_bus) + return cfg; + + return NULL; +} + static const char __init *pci_mmcfg_e7520(void) { u32 win; diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index c04523e09649..90d5fd476ed4 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -27,15 +27,10 @@ static int mmcfg_last_accessed_cpu; */ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) { - struct pci_mmcfg_region *cfg; + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); - list_for_each_entry(cfg, &pci_mmcfg_list, list) - if (cfg->segment == seg && - (cfg->start_bus <= bus) && - (cfg->end_bus >= bus)) - return cfg->address; - - /* Fall back to type 0 */ + if (cfg) + return cfg->address; return 0; } diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index cfa6cdb6d262..e783841bd1d7 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -14,28 +14,13 @@ #define PREFIX "PCI: " -static char __iomem *get_virt(unsigned int seg, unsigned bus) -{ - struct pci_mmcfg_region *cfg; - - list_for_each_entry(cfg, &pci_mmcfg_list, list) - if (cfg->segment == seg && - (cfg->start_bus <= bus) && - (cfg->end_bus >= bus)) - return cfg->virt; - - /* Fall back to type 0 */ - return NULL; -} - static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { - char __iomem *addr; + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); - addr = get_virt(seg, bus); - if (!addr) - return NULL; - return addr + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12)); + if (cfg && cfg->virt) + return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12)); + return NULL; } static int pci_mmcfg_read(unsigned int seg, unsigned int bus, -- cgit v1.2.2 From 2263576cfc6e8f6ab038126c3254404b9fcb1c33 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 13 Nov 2009 10:06:08 +0800 Subject: ACPICA: Add post-order callback to acpi_walk_namespace The existing interface only has 
a pre-order callback. This change adds an additional parameter for a post-order callback which will be more useful for bus scans. ACPICA BZ 779. Also update the external calls to acpi_walk_namespace. http://www.acpica.org/bugzilla/show_bug.cgi?id=779 Signed-off-by: Lin Ming Signed-off-by: Bob Moore Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index cabd2fa3fc93..7e7eea4f8261 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) /* Find ACPI data for processor */ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, + ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, NULL, (void *)&pr); /* Check ACPI support for C3 state */ -- cgit v1.2.2 From 273bee27fa9f79d94b78c83506016f2e41e78983 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 25 Nov 2009 08:46:28 +0900 Subject: x86: Fix iommu=soft boot option iommu=soft boot option forces the kernel to use swiotlb. ( This has the side-effect of enabling the swiotlb over the GART if this boot option is provided. This is the desired behavior of the swiotlb boot option and works like that for all other hw-IOMMU drivers. ) Signed-off-by: FUJITA Tomonori Cc: yinghai@kernel.org LKML-Reference: <20091125084611O.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e36e71daa44c..e3c0a66b9e77 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -50,6 +50,8 @@ static struct dma_map_ops swiotlb_dma_ops = { */ int __init pci_swiotlb_init(void) { + int use_swiotlb = swiotlb | swiotlb_force; + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 if (!no_iommu && max_pfn > MAX_DMA32_PFN) @@ -63,5 +65,5 @@ int __init pci_swiotlb_init(void) dma_ops = &swiotlb_dma_ops; } - return swiotlb_force; + return use_swiotlb; } -- cgit v1.2.2 From 28b4e0d86acf59ae3bc422921138a4958458326e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Nov 2009 22:24:44 +0900 Subject: x86: Rename global percpu symbol dr7 to cpu_dr7 Percpu symbols now occupy the same namespace as other global symbols and as such short global symbols without subsystem prefix tend to collide with local variables. dr7 percpu variable used by x86 was hit by this. Rename it to cpu_dr7. The rename also makes it more consistent with its fellow cpu_debugreg percpu variable. 
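After the rename, definition and access look like this (excerpted from the hunks below):

	/* Per cpu debug control register value */
	DEFINE_PER_CPU(unsigned long, cpu_dr7);
	EXPORT_PER_CPU_SYMBOL(cpu_dr7);

	static inline int hw_breakpoint_active(void)
	{
		return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
	}

Local "unsigned long *dr7" variables in hw_breakpoint.c keep their short name; only the global percpu symbol gains the cpu_ prefix.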
Signed-off-by: Tejun Heo Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Rusty Russell Cc: Christoph Lameter Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <20091125115856.GA17856@elte.hu> Signed-off-by: Ingo Molnar Reported-by: Stephen Rothwell --- arch/x86/include/asm/debugreg.h | 4 ++-- arch/x86/kernel/hw_breakpoint.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index fdabd8435765..8240f76b531e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -75,7 +75,7 @@ */ #ifdef __KERNEL__ -DECLARE_PER_CPU(unsigned long, dr7); +DECLARE_PER_CPU(unsigned long, cpu_dr7); static inline void hw_breakpoint_disable(void) { @@ -91,7 +91,7 @@ static inline void hw_breakpoint_disable(void) static inline int hw_breakpoint_active(void) { - return __get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK; + return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void aout_dump_debugregs(struct user *dump); diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4d267fb77828..92ea5aad0b5c 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -46,8 +46,8 @@ #include /* Per cpu debug control register value */ -DEFINE_PER_CPU(unsigned long, dr7); -EXPORT_PER_CPU_SYMBOL(dr7); +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); /* Per cpu debug address registers values */ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); @@ -118,7 +118,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) set_debugreg(info->address, i); __get_cpu_var(cpu_debugreg[i]) = info->address; - dr7 = &__get_cpu_var(dr7); + dr7 = &__get_cpu_var(cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); @@ -153,7 +153,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; - dr7 = &__get_cpu_var(dr7); + dr7 = &__get_cpu_var(cpu_dr7); *dr7 &= ~encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); @@ -437,7 +437,7 @@ void hw_breakpoint_restore(void) set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); set_debugreg(current->thread.debugreg6, 6); - set_debugreg(__get_cpu_var(dr7), 7); + set_debugreg(__get_cpu_var(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); -- cgit v1.2.2 From b803090615ccec669681ff85ce28671e7bfefa3d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Nov 2009 08:17:31 +0100 Subject: x86: dumpstack: Clean up the x86_stack_ids[][] initialization and other details Make the initialization more readable, plus tidy up a few small visual details as well. No change in functionality.
LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 9 ++++----- arch/x86/kernel/dumpstack_64.c | 25 +++++++++++++------------ 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f7dd2a7c3bf4..e0ed4c7abb62 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -10,9 +10,9 @@ #include #include #include +#include #include #include -#include #include @@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; + stack = &dummy; if (task && task != current) stack = (unsigned long *)task->thread.sp; @@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, - data, NULL, &graph); + bp = print_context_stack(context, stack, bp, ops, data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) @@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } - diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a071e6be177e..cfec478a4e42 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -10,26 +10,28 @@ #include #include #include +#include #include #include -#include #include #include "dumpstack.h" +#define N_EXCEPTION_STACKS_END \ + (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) static char x86_stack_ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", + [ DEBUG_STACK-1 ] = "#DB", + [ NMI_STACK-1 ] = "NMI", + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ STACKFAULT_STACK-1 ] = "#SS", + [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" + [ N_EXCEPTION_STACKS ... 
+ N_EXCEPTION_STACKS_END ] = "#DB[?]" #endif - }; +}; int x86_is_stack_id(int id, char *name) { @@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name) } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) + unsigned *usedp, char **idp) { unsigned k; @@ -202,7 +204,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -303,4 +305,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } - -- cgit v1.2.2 From 67f2de0bf9141dd9fe9189d0caaa28d7ad21a523 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Nov 2009 08:29:10 +0100 Subject: x86: dumpstack, 64-bit: Disable preemption when walking the IRQ/exception stacks This warning: [ 847.140022] rb_producer D 0000000000000000 5928 519 2 0x00000000 [ 847.203627] BUG: using smp_processor_id() in preemptible [00000000] code: khungtaskd/517 [ 847.207360] caller is show_stack_log_lvl+0x2e/0x241 [ 847.210364] Pid: 517, comm: khungtaskd Not tainted 2.6.32-rc8-tip+ #13761 [ 847.213395] Call Trace: [ 847.215847] [] debug_smp_processor_id+0x1f0/0x20a [ 847.216809] [] show_stack_log_lvl+0x2e/0x241 [ 847.220027] [] show_stack+0x1c/0x1e [ 847.223365] [] sched_show_task+0xe4/0xe9 [ 847.226694] [] check_hung_task+0x140/0x199 [ 847.230261] [] check_hung_uninterruptible_tasks+0x1b7/0x20f [ 847.233371] [] ? watchdog+0x0/0x50 [ 847.236683] [] watchdog+0x4e/0x50 [ 847.240034] [] kthread+0x97/0x9f [ 847.243372] [] child_rip+0xa/0x20 [ 847.246690] [] ? restore_args+0x0/0x30 [ 847.250019] [] ? _spin_lock+0xe/0x10 [ 847.253351] [] ? kthread+0x0/0x9f [ 847.256833] [] ? child_rip+0x0/0x20 Happens because on preempt-RCU, khungd calls show_stack() with preemption enabled. Make sure we are not preemptible while walking the IRQ and exception stacks on 64-bit. (32-bit stack dumping is preemption safe.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index cfec478a4e42..8e740934bd1f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -206,19 +206,22 @@ void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { + unsigned long *irq_stack_end; + unsigned long *irq_stack; unsigned long *stack; + int cpu; int i; - const int cpu = smp_processor_id(); - unsigned long *irq_stack_end = - (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - unsigned long *irq_stack = - (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + + preempt_disable(); + cpu = smp_processor_id(); + + irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); /* - * debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu. 
+ * Debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu: */ - if (sp == NULL) { if (task) sp = (unsigned long *)task->thread.sp; @@ -242,6 +245,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk(" %016lx", *stack++); touch_nmi_watchdog(); } + preempt_enable(); + printk("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -- cgit v1.2.2 From 2d4dc890b5c8fabd818a8586607e6843c4375e62 Mon Sep 17 00:00:00 2001 From: Ilya Loginov Date: Thu, 26 Nov 2009 09:16:19 +0100 Subject: block: add helpers to run flush_dcache_page() against a bio and a request's pages The mtdblock driver doesn't call flush_dcache_page for the pages in a request. This causes problems on architectures where the icache doesn't fill from the dcache, or which have dcache aliases. The patch fixes this. The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE symbol was introduced to avoid pointless empty cache-thrashing loops on architectures for which flush_dcache_page() is a no-op. Every architecture is provided with this symbol; the new helpers flush the pages on architectures where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE equals 1 and do nothing otherwise. See the "fix mtd_blkdevs problem with caches on some architectures" discussion on LKML for more information. Signed-off-by: Ilya Loginov Cc: Ingo Molnar Cc: David Woodhouse Cc: Peter Horton Cc: "Ed L. Cashin" Signed-off-by: Jens Axboe --- arch/x86/include/asm/cacheflush.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index b54f6afe7ec4..9076add593a8 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { } static inline void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) { } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 static inline void flush_dcache_page(struct page *page) { } static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } -- cgit v1.2.2 From 605bfaee9078cd0b01d83402315389839ee4bb5c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 26 Nov 2009 05:35:42 +0100 Subject: hw-breakpoints: Simplify error handling in breakpoint creation requests This simplifies the error handling when we create a breakpoint. We don't need to check the NULL return value corner case anymore since we have improved perf_event_create_kernel_counter() to always return an error code in the failure case.
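For context, a minimal sketch (not from this patch) of the ERR_PTR convention the simplification relies on: once the creation path always returns an encoded error pointer rather than NULL on failure, callers need only a single check:

	#include <linux/err.h>

	bp = register_user_hw_breakpoint(...);	/* never returns NULL on failure now */
	if (IS_ERR(bp))				/* pointer encodes -EINVAL, -ENOSPC, ... */
		return PTR_ERR(bp);		/* the separate !bp test becomes dead code */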
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Steven Rostedt Cc: Prasad LKML-Reference: <1259210142-5714-3-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b25f8947ed7a..75e0cd847bd6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -657,10 +657,7 @@ restore: tsk, true); thread->ptrace_bps[i] = NULL; - if (!bp) { /* incorrect bp, or we have a bug in bp API */ - rc = -EINVAL; - break; - } + /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { rc = PTR_ERR(bp); bp = NULL; @@ -729,9 +726,6 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, tsk, bp->attr.disabled); } - - if (!bp) - return -EIO; /* * CHECKME: the previous code returned -EIO if the addr wasn't a * valid task virtual addr. The new one will return -EINVAL in this -- cgit v1.2.2 From 2c31b7958fd21df9fa04e5c36cda0f063ac70b27 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 26 Nov 2009 06:04:38 +0100 Subject: x86/hw-breakpoints: Don't lose GE flag while disabling a breakpoint When we schedule out a breakpoint from the cpu, we also incidentally remove the "Global exact breakpoint" flag from the breakpoint control register. This makes us lose the fine-grained precision about the origin of the instructions that may trigger breakpoint exceptions for the other breakpoints running in this cpu. Reported-by: Prasad Signed-off-by: Frederic Weisbecker LKML-Reference: <1259211878-6013-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 92ea5aad0b5c..d42f65ac4927 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -59,21 +59,27 @@ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); -/* - * Encode the length, type, Exact, and Enable bits for a particular breakpoint - * as stored in debug register 7. - */ -unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) { unsigned long bp_info; bp_info = (len | type) & 0xf; bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); - bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) | - DR_GLOBAL_SLOWDOWN; + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + return bp_info; } +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + /* * Decode the length and type bits for a particular breakpoint as * stored in debug register 7. Return the "enabled" status.
@@ -154,7 +160,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) return; dr7 = &__get_cpu_var(cpu_dr7); - *dr7 &= ~encode_dr7(i, info->len, info->type); + *dr7 &= ~__encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); } -- cgit v1.2.2 From 9b3660a55a9052518c91cc7c62d89e22f3f6f490 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 17 Nov 2009 18:22:16 -0600 Subject: x86: Limit number of per cpu TSC sync messages Limit the number of per cpu TSC sync messages by only printing to the console if an error occurs, otherwise print as a DEBUG message. The info message "Skipping synchronization ..." is only printed after the last cpu has booted. Signed-off-by: Mike Travis Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091118002222.181053000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index f37930954d15..eed156851f5d 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); + if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) + pr_info( + "Skipped synchronization checks as TSC is reliable.\n"); return; } - pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); - /* * Reset it - in case this is a second bootup: */ @@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu) cpu_relax(); if (nr_warps) { - printk("\n"); + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { - printk(" passed.\n"); + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); } /* -- cgit v1.2.2 From 767df1bdd8cbff2c8c40c9ac8295bbdaa5fb24c4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 17:29:02 +0900 Subject: x86, mce: Add __cpuinit to hotplug callback functions The mce_disable_cpu() and mce_reenable_cpu() are called only from mce_cpu_callback() which is marked as __cpuinit. So these functions can be __cpuinit too. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen LKML-Reference: <4B0E3C4E.4090809@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5f277cad2ed7..0bcaa3875863 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1953,13 +1953,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. 
*/ -static void mce_disable_cpu(void *h) +static void __cpuinit mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { @@ -1970,7 +1971,7 @@ static void mce_disable_cpu(void *h) } } -static void mce_reenable_cpu(void *h) +static void __cpuinit mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; -- cgit v1.2.2 From 79b0379cee09b00ef309384aff652e328e438c79 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 25 Nov 2009 14:18:26 -0500 Subject: x86: Optimize loadsegment() Zero the input register in the exception handler instead of using an extra register to pass in a zero value. Signed-off-by: Brian Gerst LKML-Reference: <1259176706-5908-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/system.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 1a953e26401c..537395a2877a 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -156,18 +156,19 @@ extern void native_load_gs_index(unsigned); * segment if something goes wrong.. */ #define loadsegment(seg, value) \ +do { \ + unsigned short __val = value; \ asm volatile("\n" \ "1:\t" \ "movl %k0,%%" #seg "\n" \ - "2:\n" \ ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "movl %k1, %%" #seg "\n\t" \ - "jmp 2b\n" \ + "2:\t" \ + "xorl %k0,%k0\n\t" \ + "jmp 1b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b,3b) \ - : :"r" (value), "r" (0) : "memory") - + _ASM_EXTABLE(1b, 2b) \ + : "+r" (__val) : : "memory"); \ +} while (0) /* * Save a segment register away -- cgit v1.2.2 From 64b028b22616946a05bf9580f7f7f7ee2ac070b4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Nov 2009 10:37:55 +0100 Subject: x86: Clean up the loadsegment() macro Make it readable in the source too, not just in the assembly output. No change in functionality. Cc: Brian Gerst LKML-Reference: <1259176706-5908-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/system.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 537395a2877a..022a84386de8 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -155,19 +155,21 @@ extern void native_load_gs_index(unsigned); * Load a segment. Fall back on loading the zero * segment if something goes wrong.. 
*/ -#define loadsegment(seg, value) \ -do { \ - unsigned short __val = value; \ - asm volatile("\n" \ - "1:\t" \ - "movl %k0,%%" #seg "\n" \ - ".section .fixup,\"ax\"\n" \ - "2:\t" \ - "xorl %k0,%k0\n\t" \ - "jmp 1b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 2b) \ - : "+r" (__val) : : "memory"); \ +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ } while (0) /* -- cgit v1.2.2 From 8ec6993d9f7d961014af970ded57542961fe9ad9 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 25 Nov 2009 11:17:36 -0500 Subject: x86, 64-bit: Set data segments to null after switching to 64-bit mode This prevents kernel threads from inheriting non-null segment selectors, and causing optimizations in __switch_to() to be ineffective. Signed-off-by: Brian Gerst Cc: Tim Blechmann Cc: Linus Torvalds Cc: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Jan Beulich LKML-Reference: <1259165856-3512-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 26406601031c..17ba9ecf27c5 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -212,8 +212,8 @@ ENTRY(secondary_startup_64) */ lgdt early_gdt_descr(%rip) - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax + /* set up data segments */ + xorl %eax,%eax movl %eax,%ds movl %eax,%ss movl %eax,%es -- cgit v1.2.2 From 918bc960dc630b1a79c0d2991a81985812ff69f5 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Wed, 25 Nov 2009 10:20:19 -0600 Subject: x86: SGI UV: Map low MMR ranges Explicitly mmap the UV chipset MMR address ranges used to access blade-local registers. Although these same MMRs are also mmaped at higher addresses, the low range is more convenient when accessing blade-local registers. The low range addresses always alias to the local blade regardless of the blade id. 
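As an aside, a hypothetical helper shape (not taken from the patch) shows why the fixed low alias is convenient: a blade-local read needs no per-blade base computation at all:

	/* sketch: read a blade-local MMR through the fixed low alias */
	static inline unsigned long read_local_mmr_sketch(unsigned long offset)
	{
		return *(volatile unsigned long *)(UV_LOCAL_MMR_BASE | offset);
	}

The uv_read_local_mmr() call visible in the hunk below depends on this aliasing once map_low_mmrs() has established the uncached mapping.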
Signed-off-by: Jack Steiner LKML-Reference: <20091125162018.GA25445@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5f5886a6b53..6d425490fb1f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -409,6 +409,12 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); } +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + static __init void uv_rtc_init(void) { long status; @@ -550,6 +556,8 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + map_low_mmrs(); + m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; -- cgit v1.2.2 From dd4377b02d9f028006beed1b7b1695ee5d1498b6 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 26 Nov 2009 19:53:48 +0800 Subject: x86/pat: Trivial: don't create debugfs for memtype if pat is disabled If pat is disabled (boot with nopat), there's no need to create debugfs for it, it's empty all the time. Signed-off-by: Xiaotian Feng Cc: Suresh Siddha Cc: Venkatesh Pallipadi Cc: H. Peter Anvin LKML-Reference: <1259236428-16329-1-git-send-email-dfeng@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ef712518b5b4..a81b7e73275d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -1019,8 +1019,10 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, - NULL, &memtype_fops); + if (pat_enabled) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } return 0; } -- cgit v1.2.2 From 5fa10b28e57f94a90535cfeafe89dcee9f47d540 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Nov 2009 04:55:53 +0100 Subject: hw-breakpoints: Use struct perf_event_attr to define user breakpoints In-kernel user breakpoints are created using functions in which we pass breakpoint parameters as individual variables: address, length and type. Although it fits well for x86, this just does not scale across architectures that may support this api later as these may have more or different needs. Pass in a perf_event_attr structure instead because it is meant to evolve as much as possible into a generic hardware breakpoint parameter structure.
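A sketch of the resulting call pattern, mirroring the ptrace hunks below (DEFINE_BREAKPOINT_ATTR() and the bp_* fields are the ones this series introduces):

	DEFINE_BREAKPOINT_ATTR(attr);

	attr.bp_addr  = addr;			/* which address to watch   */
	attr.bp_len   = HW_BREAKPOINT_LEN_1;	/* one byte                 */
	attr.bp_type  = HW_BREAKPOINT_W;	/* trigger on write         */
	attr.disabled = 1;			/* reserve the slot, arm later */

	bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
	if (IS_ERR(bp))
		return PTR_ERR(bp);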
Reported-by: K.Prasad Signed-off-by: Frederic Weisbecker LKML-Reference: <1259294154-5197-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 74 +++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 75e0cd847bd6..2941b32ea666 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -593,6 +593,34 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } +static struct perf_event * +ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + struct task_struct *tsk) +{ + int err; + int gen_len, gen_type; + DEFINE_BREAKPOINT_ATTR(attr); + + /* + * We shoud have at least an inactive breakpoint at this + * slot. It means the user is writing dr7 without having + * written the address register first + */ + if (!bp) + return ERR_PTR(-EINVAL); + + err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + if (err) + return ERR_PTR(err); + + attr = bp->attr; + attr.bp_len = gen_len; + attr.bp_type = gen_type; + attr.disabled = 0; + + return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); +} + /* * Handle ptrace writes to debug register 7. */ @@ -603,7 +631,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) int i, orig_ret = 0, rc = 0; int enabled, second_pass = 0; unsigned len, type; - int gen_len, gen_type; struct perf_event *bp; data &= ~DR_CONTROL_RESERVED; @@ -634,33 +661,12 @@ restore: continue; } - /* - * We shoud have at least an inactive breakpoint at this - * slot. It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) { - rc = -EINVAL; - break; - } - - rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type); - if (rc) - break; - - /* - * This is a temporary thing as bp is unregistered/registered - * to simulate modification - */ - bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len, - gen_type, bp->callback, - tsk, true); - thread->ptrace_bps[i] = NULL; + bp = ptrace_modify_breakpoint(bp, len, type, tsk); /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { rc = PTR_ERR(bp); - bp = NULL; + thread->ptrace_bps[i] = NULL; break; } thread->ptrace_bps[i] = bp; @@ -707,24 +713,26 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, { struct perf_event *bp; struct thread_struct *t = &tsk->thread; + DEFINE_BREAKPOINT_ATTR(attr); if (!t->ptrace_bps[nr]) { /* * Put stub len and type to register (reserve) an inactive but * correct bp */ - bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1, - HW_BREAKPOINT_W, - ptrace_triggered, tsk, - false); + attr.bp_addr = addr; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); } else { bp = t->ptrace_bps[nr]; t->ptrace_bps[nr] = NULL; - bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len, - bp->attr.bp_type, - bp->callback, - tsk, - bp->attr.disabled); + + attr = bp->attr; + attr.bp_addr = addr; + bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } /* * CHECKME: the previous code returned -EIO if the addr wasn't a -- cgit v1.2.2 From 6a9401a7ac13e62ef2baf4d46e022d303edc3050 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 13:22:21 +0100 Subject: x86/amd-iommu: Separate internal interface definitions This patch moves all function declarations which are 
only used inside the driver code to a separate header file. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 10 ++++----- arch/x86/include/asm/amd_iommu_proto.h | 38 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/amd_iommu_types.h | 5 ----- arch/x86/kernel/amd_iommu.c | 1 + arch/x86/kernel/amd_iommu_init.c | 1 + 5 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 arch/x86/include/asm/amd_iommu_proto.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index b8ef2ee93643..089133899b3c 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -23,15 +23,13 @@ #include #ifdef CONFIG_AMD_IOMMU -extern int amd_iommu_init_dma_ops(void); -extern int amd_iommu_init_passthrough(void); + extern void amd_iommu_detect(void); -extern irqreturn_t amd_iommu_int_handler(int irq, void *data); -extern void amd_iommu_flush_all_domains(void); -extern void amd_iommu_flush_all_devices(void); -extern void amd_iommu_apply_erratum_63(u16 devid); + #else + static inline void amd_iommu_detect(void) { } + #endif #endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h new file mode 100644 index 000000000000..84786fb9a23b --- /dev/null +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_PROTO_H +#define _ASM_X86_AMD_IOMMU_PROTO_H + +struct amd_iommu; + +extern int amd_iommu_init_dma_ops(void); +extern int amd_iommu_init_passthrough(void); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_flush_all_domains(void); +extern void amd_iommu_flush_all_devices(void); +extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); + +#ifndef CONFIG_AMD_IOMMU_STATS + +static inline void amd_iommu_stats_init(void) { } + +#endif /* !CONFIG_AMD_IOMMU_STATS */ + +#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 2a2cc7a78a81..27db7f9c7aeb 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -462,11 +462,6 @@ struct __iommu_counter { #define ADD_STATS_COUNTER(name, x) #define SUB_STATS_COUNTER(name, x) -static inline void amd_iommu_stats_init(void) { } - #endif /* CONFIG_AMD_IOMMU_STATS */ -/* some function prototypes */ -extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); - #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b74b21247584..50d2b05a458b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 72bdbdac9b48..db30cfe86fce 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From bf3118c1276d27fe9e84aa42382da25ee0750777 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 13:39:19 +0100 Subject: x86/amd-iommu: Update copyright headers This patch updates the copyright headers in the relevant AMD IOMMU driver files to match the date of the latest changes. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 2 +- arch/x86/include/asm/amd_iommu_types.h | 2 +- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 089133899b3c..5af2982133b5 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 27db7f9c7aeb..df5e9c8a856a 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 50d2b05a458b..7fe28be3b548 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index db30cfe86fce..cee11424d412 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * -- cgit v1.2.2 From bb52777ec4d736c2d7c4f037b32d4eeeb172ed89 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 14:31:51 +0100 Subject: x86/amd-iommu: Add an index field to struct amd_iommu This patch adds an index field to struct amd_iommu which can be used to lookup it up in an array. This index will be used in struct protection_domain to keep track which protection domain has devices behind which IOMMU. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 17 +++++++++++++++++ arch/x86/kernel/amd_iommu_init.c | 15 +++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index df5e9c8a856a..ab3e7bf1af71 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -24,6 +24,11 @@ #include #include +/* + * Maximum number of IOMMUs supported + */ +#define MAX_IOMMUS 32 + /* * some size calculation constants */ @@ -291,6 +296,9 @@ struct dma_ops_domain { struct amd_iommu { struct list_head list; + /* Index within the IOMMU array */ + int index; + /* locks the accesses to the hardware */ spinlock_t lock; @@ -356,6 +364,15 @@ struct amd_iommu { */ extern struct list_head amd_iommu_list; +/* + * Array with pointers to each IOMMU struct + * The indices are referenced in the protection domains + */ +extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; + +/* Number of IOMMUs present in the system */ +extern int amd_iommus_present; + /* * Structure defining one entry in the device table */ diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index cee11424d412..8567d1698027 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -137,6 +137,10 @@ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ +/* Array to assign indices to IOMMUs*/ +struct amd_iommu *amd_iommus[MAX_IOMMUS]; +int amd_iommus_present; + /* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains @@ -840,7 +844,18 @@ static void __init free_iommu_all(void) static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); + + /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); + iommu->index = amd_iommus_present++; + + if (unlikely(iommu->index >= MAX_IOMMUS)) { + WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n"); + return -ENOSYS; + } + + /* Index is fine - add IOMMU to the array */ + amd_iommus[iommu->index] = iommu; /* * Copy data from ACPI table entry to the iommu struct -- cgit v1.2.2 From c459611424d8b8396060eb766e23bd0c70c993bc Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 14:57:32 +0100 Subject: x86/amd-iommu: Add per IOMMU reference counting This patch adds reference counting for protection domains per IOMMU. 
This allows a smarter TLB flushing strategy. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 ++ arch/x86/kernel/amd_iommu.c | 12 +++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index ab3e7bf1af71..e68b14811380 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -238,7 +238,9 @@ struct protection_domain { unsigned long flags; /* flags to find out type of domain */ bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ + unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ void *priv; /* private data */ + }; /* diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 7fe28be3b548..8c38f0085403 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1175,7 +1175,9 @@ static void __attach_device(struct amd_iommu *iommu, /* update DTE entry */ set_dte_entry(devid, domain); - domain->dev_cnt += 1; + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; /* ready */ spin_unlock(&domain->lock); @@ -1209,6 +1211,9 @@ static void attach_device(struct amd_iommu *iommu, */ static void __detach_device(struct protection_domain *domain, u16 devid) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + BUG_ON(!iommu); /* lock domain */ spin_lock(&domain->lock); @@ -1223,8 +1228,9 @@ static void __detach_device(struct protection_domain *domain, u16 devid) amd_iommu_apply_erratum_63(devid); - /* decrease reference counter */ - domain->dev_cnt -= 1; + /* decrease reference counters */ + domain->dev_iommu[iommu->index] -= 1; + domain->dev_cnt -= 1; /* ready */ spin_unlock(&domain->lock); -- cgit v1.2.2 From 0518a3a4585cb3eeeaf14ca57131f11d252130c6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 16:00:05 +0100 Subject: x86/amd-iommu: Add function to complete a tlb flush This patch adds a function to the AMD IOMMU driver which completes all queued commands an all IOMMUs a specific domain has devices attached on. This is required in a later patch when per-domain flushing is implemented. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8c38f0085403..8fa5cc3e02d2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -376,6 +376,22 @@ out: return 0; } +static void iommu_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. 
+ */ + iommu_completion_wait(amd_iommus[i]); + } +} + /* * Command send function for invalidating a device table entry */ @@ -1758,7 +1774,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, if (addr == DMA_ERROR_CODE) goto out; - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1791,7 +1807,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, __unmap_single(iommu, domain->priv, dma_addr, size, dir); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1863,7 +1879,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1914,7 +1930,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, s->dma_address = s->dma_length = 0; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1969,7 +1985,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out_free; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2010,7 +2026,7 @@ static void free_coherent(struct device *dev, size_t size, __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); -- cgit v1.2.2 From 6de8ad9b9ee0ec5b52ec8ec41401833e5e89186f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 18:30:32 +0100 Subject: x86/amd-iommu: Make iommu_flush_pages aware of multiple IOMMUs This patch extends the iommu_flush_pages function to flush the TLB entries on all IOMMUs the domain has devices on. This basically gives up the former assumption that dma_ops domains are only bound to one IOMMU in the system. For dma_ops domains this is still true but not for IOMMU-API managed domains. Giving this assumption up for dma_ops domains too allows code simplification. Further it splits out the main logic into a generic function which can be used by iommu_flush_tlb too. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8fa5cc3e02d2..7c06e574008f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -447,10 +447,10 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. 
*/ -static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, - u64 address, size_t size) +static void __iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { - int s = 0; + int s = 0, i; unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -464,9 +464,26 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, s = 1; } - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); - return 0; + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need a TLB flush + */ + iommu_queue_inv_iommu_pages(amd_iommus[i], address, + domain->id, pde, s); + } + + return; +} + +static void iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size) +{ + __iommu_flush_pages(domain, address, size, 0); } /* Flush the whole IO/TLB for a given protection domain */ @@ -1683,7 +1700,7 @@ retry: iommu_flush_tlb(iommu, dma_dom->domain.id); dma_dom->need_flush = false; } else if (unlikely(iommu_has_npcache(iommu))) - iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + iommu_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1731,7 +1748,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, dma_addr, size); dma_dom->need_flush = false; } } -- cgit v1.2.2 From dcd1e92e405449ecc5e8bd8fcfebf3b2a13d3d37 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 15:30:58 +0100 Subject: x86/amd-iommu: Use __iommu_flush_pages for tlb flushes This patch re-implements iommu_flush_tlb functions to use the __iommu_flush_pages logic. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 7c06e574008f..c55aa079ded3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -451,7 +451,7 @@ static void __iommu_flush_pages(struct protection_domain *domain, u64 address, size_t size, int pde) { int s = 0, i; - unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); + unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -487,23 +487,15 @@ static void iommu_flush_pages(struct protection_domain *domain, } /* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_tlb(struct protection_domain *domain) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); - - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_tlb_pde(struct protection_domain *domain) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); - - iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } /* @@ -1236,7 +1228,7 @@ static void attach_device(struct amd_iommu *iommu, * here to evict all dirty stuff. 
*/ iommu_queue_inv_dev_entry(iommu, devid); - iommu_flush_tlb_pde(iommu, domain->id); + iommu_flush_tlb_pde(domain); } /* @@ -1697,7 +1689,7 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(iommu, dma_dom->domain.id); + iommu_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; } else if (unlikely(iommu_has_npcache(iommu))) iommu_flush_pages(&dma_dom->domain, address, size); -- cgit v1.2.2 From 601367d76bd19b7eea2286ae99e5b1cb5d74f38d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 16:08:55 +0100 Subject: x86/amd-iommu: Remove iommu_flush_domain function This iommu_flush_tlb_pde function does essentially the same. So the iommu_flush_domain function is redundant and can be removed. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c55aa079ded3..b2c19f41f238 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -528,20 +528,6 @@ static void flush_all_domains_on_iommu(struct amd_iommu *iommu) } -/* - * This function is used to flush the IO/TLB for a given protection domain - * on every IOMMU in the system - */ -static void iommu_flush_domain(u16 domid) -{ - struct amd_iommu *iommu; - - INC_STATS_COUNTER(domain_flush_all); - - for_each_iommu(iommu) - flush_domain_on_iommu(iommu, domid); -} - void amd_iommu_flush_all_domains(void) { struct amd_iommu *iommu; @@ -1464,7 +1450,7 @@ static void update_domain(struct protection_domain *domain) update_device_table(domain); flush_devices_by_domain(domain); - iommu_flush_domain(domain->id); + iommu_flush_tlb_pde(domain); domain->updated = false; } @@ -2377,7 +2363,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova += PAGE_SIZE; } - iommu_flush_domain(domain->id); + iommu_flush_tlb_pde(domain); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, -- cgit v1.2.2 From aeb26f55337d4310840c8adc3ec7d6aebb714472 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 16:44:01 +0100 Subject: x86/amd-iommu: Implement protection domain list This patch adds code to keep a global list of all protection domains. This allows to simplify the resume code. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 7 +++++++ arch/x86/kernel/amd_iommu.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/kernel/amd_iommu_init.c | 8 ++++++++ 3 files changed, 48 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index e68b14811380..b332b7f7d8d6 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -231,6 +231,7 @@ extern bool amd_iommu_dump; * independent of their use. 
*/ struct protection_domain { + struct list_head list; /* for list of all protection domains */ spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ @@ -375,6 +376,12 @@ extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; /* Number of IOMMUs present in the system */ extern int amd_iommus_present; +/* + * Declarations for the global list of all protection domains + */ +extern spinlock_t amd_iommu_pd_lock; +extern struct list_head amd_iommu_pd_list; + /* * Structure defining one entry in the device table */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b2c19f41f238..0c4319b13014 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -985,6 +985,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, * ****************************************************************************/ +/* + * This function adds a protection domain to the global protection domain list + */ +static void add_domain_to_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_add(&domain->list, &amd_iommu_pd_list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +/* + * This function removes a protection domain to the global + * protection domain list + */ +static void del_domain_from_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_del(&domain->list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + static u16 domain_id_alloc(void) { unsigned long flags; @@ -1073,6 +1098,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; + del_domain_from_list(&dom->domain); + free_pagetable(&dom->domain); for (i = 0; i < APERTURE_MAX_RANGES; ++i) { @@ -1113,6 +1140,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; + add_domain_to_list(&dma_dom->domain); + if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) goto free_dma_dom; @@ -2188,6 +2217,8 @@ static void protection_domain_free(struct protection_domain *domain) if (!domain) return; + del_domain_from_list(domain); + if (domain->id) domain_id_free(domain->id); @@ -2207,6 +2238,8 @@ static struct protection_domain *protection_domain_alloc(void) if (!domain->id) goto out_err; + add_domain_to_list(domain); + return domain; out_err: diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8567d1698027..73d5173765d2 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -141,6 +141,12 @@ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the struct amd_iommu *amd_iommus[MAX_IOMMUS]; int amd_iommus_present; +/* + * List of protection domains - used during resume + */ +LIST_HEAD(amd_iommu_pd_list); +spinlock_t amd_iommu_pd_lock; + /* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains @@ -1263,6 +1269,8 @@ static int __init amd_iommu_init(void) */ amd_iommu_pd_alloc_bitmap[0] = 1; + spin_lock_init(&amd_iommu_pd_lock); + /* * now the data structures are allocated and basically initialized * start the real acpi table scan -- cgit v1.2.2 From e3306664eb307ae4cc93211cd9f12d0dbd49de65 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 16:48:58 +0100 Subject: x86/amd-iommu: Reimplement 
amd_iommu_flush_all_domains() This patch reimplements the amd_iommu_flush_all_domains function to use the global protection domain list instead of flushing every domain on every IOMMU. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0c4319b13014..5141f5608c5c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -530,10 +530,12 @@ static void flush_all_domains_on_iommu(struct amd_iommu *iommu) void amd_iommu_flush_all_domains(void) { - struct amd_iommu *iommu; + struct protection_domain *domain; - for_each_iommu(iommu) - flush_all_domains_on_iommu(iommu); + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + iommu_flush_tlb_pde(domain); + iommu_flush_complete(domain); + } } static void flush_all_devices_for_iommu(struct amd_iommu *iommu) -- cgit v1.2.2 From 09b4280439ef6fdc55f1353a9135034336eb5d26 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 17:02:44 +0100 Subject: x86/amd-iommu: Reimplement flush_all_domains_on_iommu() This patch reimplements the function flush_all_domains_on_iommu to use the global protection domain list. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 5141f5608c5c..a1bd99d390ab 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -499,43 +499,48 @@ static void iommu_flush_tlb_pde(struct protection_domain *domain) } /* - * This function flushes one domain on one IOMMU + * This function flushes all domains that have devices on the given IOMMU */ -static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +static void flush_all_domains_on_iommu(struct amd_iommu *iommu) { - struct iommu_cmd cmd; + u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + struct protection_domain *domain; unsigned long flags; - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); - - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); -} - -static void flush_all_domains_on_iommu(struct amd_iommu *iommu) -{ - int i; + spin_lock_irqsave(&amd_iommu_pd_lock, flags); - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + if (domain->dev_iommu[iommu->index] == 0) continue; - flush_domain_on_iommu(iommu, i); + + spin_lock(&domain->lock); + iommu_queue_inv_iommu_pages(iommu, address, domain->id, 1, 1); + iommu_flush_complete(domain); + spin_unlock(&domain->lock); } + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } +/* + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume.
*/ void amd_iommu_flush_all_domains(void) { struct protection_domain *domain; + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); list_for_each_entry(domain, &amd_iommu_pd_list, list) { + spin_lock(&domain->lock); iommu_flush_tlb_pde(domain); iommu_flush_complete(domain); + spin_unlock(&domain->lock); } + + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } static void flush_all_devices_for_iommu(struct amd_iommu *iommu) -- cgit v1.2.2 From 318afd41d2eca3224de3fd85a3b9a27a3010a98d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 18:32:38 +0100 Subject: x86/amd-iommu: Make np-cache a global flag The non-present cache flag was IOMMU local until now which doesn't make sense. Make this a global flag so we can remove the last user of 'struct iommu' in the map/unmap path. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 3 +++ arch/x86/kernel/amd_iommu.c | 8 +------- arch/x86/kernel/amd_iommu_init.c | 6 ++++++ 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index b332b7f7d8d6..4899f783df68 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -211,6 +211,9 @@ extern bool amd_iommu_dump; printk(KERN_INFO "AMD-Vi: " format, ## arg); \ } while(0); +/* global flag if IOMMUs cache non-present entries */ +extern bool amd_iommu_np_cache; + /* * Make iterating over all IOMMUs easier */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a1bd99d390ab..5ebd24e4fc57 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -131,12 +131,6 @@ static void amd_iommu_stats_init(void) #endif -/* returns !0 if the IOMMU is caching non-present entries in its TLB */ -static int iommu_has_npcache(struct amd_iommu *iommu) -{ - return iommu->cap & (1UL << IOMMU_CAP_NPCACHE); -} - /**************************************************************************** * * Interrupt handling functions @@ -1713,7 +1707,7 @@ retry: if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { iommu_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; - } else if (unlikely(iommu_has_npcache(iommu))) + } else if (unlikely(amd_iommu_np_cache)) iommu_flush_pages(&dma_dom->domain, address, size); out: diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 73d5173765d2..fbe4c3c02a91 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -141,6 +141,9 @@ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the struct amd_iommu *amd_iommus[MAX_IOMMUS]; int amd_iommus_present; +/* IOMMUs have a non-present cache? */ +bool amd_iommu_np_cache __read_mostly; + /* * List of protection domains - used during resume */ @@ -891,6 +894,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) + amd_iommu_np_cache = true; + return pci_enable_device(iommu->dev); } -- cgit v1.2.2 From 420aef8a3acfc3e75427107e23d5a9bafd17c477 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 16:14:57 +0100 Subject: x86/amd-iommu: Use check_device for amd_iommu_dma_supported The check_device logic needs to include the dma_supported checks to be really sure. Merge the dma_supported logic into check_device and use it to implement dma_supported.
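Condensed from the hunks below, the resulting shape: every per-device scoping test lives in check_device(), and the dma_ops .dma_supported callback becomes a plain delegation:

	static int amd_iommu_dma_supported(struct device *dev, u64 mask)
	{
		return check_device(dev);	/* PCI bus, devid range, rlookup table */
	}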
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 5ebd24e4fc57..ac27b1d6bd12 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1375,9 +1375,27 @@ static struct notifier_block device_nb = { */ static bool check_device(struct device *dev) { + u16 bdf; + struct pci_dev *pcidev; + if (!dev || !dev->dma_mask) return false; + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return false; + + pcidev = to_pci_dev(dev); + + bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + + /* Out of our scope? */ + if (bdf > amd_iommu_last_bdf) + return false; + + if (amd_iommu_rlookup_table[bdf] == NULL) + return false; + return true; } @@ -2065,22 +2083,7 @@ free_mem: */ static int amd_iommu_dma_supported(struct device *dev, u64 mask) { - u16 bdf; - struct pci_dev *pcidev; - - /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) - return 0; - - pcidev = to_pci_dev(dev); - - bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* Out of our scope? */ - if (bdf > amd_iommu_last_bdf) - return 0; - - return 1; + return check_device(dev); } /* -- cgit v1.2.2 From f99c0f1c75f75924a6f19cb40a21ccefc6e8754d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 16:52:56 +0100 Subject: x86/amd-iommu: Use check_device in get_device_resources Every call site of get_device_resources calls check_device first, so call check_device from get_device_resources directly and simplify the code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 86 +++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index ac27b1d6bd12..c5102ebdcbd9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1432,35 +1432,24 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) * If the device is not yet associated with a domain this is also done * in this function. */ -static int get_device_resources(struct device *dev, - struct amd_iommu **iommu, - struct protection_domain **domain, - u16 *bdf) +static bool get_device_resources(struct device *dev, + struct amd_iommu **iommu, + struct protection_domain **domain, + u16 *bdf) { struct dma_ops_domain *dma_dom; struct pci_dev *pcidev; u16 _bdf; - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; - - if (dev->bus != &pci_bus_type) - return 0; - - pcidev = to_pci_dev(dev); - _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* device not translated by any IOMMU in the system?
*/ - if (_bdf > amd_iommu_last_bdf) - return 0; - - *bdf = amd_iommu_alias_table[_bdf]; + if (!check_device(dev)) + return false; - *iommu = amd_iommu_rlookup_table[*bdf]; - if (*iommu == NULL) - return 0; + pcidev = to_pci_dev(dev); + _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + *bdf = amd_iommu_alias_table[_bdf]; + *iommu = amd_iommu_rlookup_table[*bdf]; *domain = domain_for_device(*bdf); + if (*domain == NULL) { dma_dom = find_protection_domain(*bdf); if (!dma_dom) @@ -1474,7 +1463,7 @@ static int get_device_resources(struct device *dev, if (domain_for_device(_bdf) == NULL) attach_device(*iommu, *domain, _bdf); - return 1; + return true; } static void update_device_table(struct protection_domain *domain) @@ -1797,17 +1786,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page, INC_STATS_COUNTER(cnt_map_single); - if (!check_device(dev)) - return DMA_ERROR_CODE; - - dma_mask = *dev->dma_mask; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (iommu == NULL || domain == NULL) + if (!get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; + dma_mask = *dev->dma_mask; + if (!dma_ops_domain(domain)) return DMA_ERROR_CODE; @@ -1838,8 +1822,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, INC_STATS_COUNTER(cnt_unmap_single); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; @@ -1893,16 +1876,11 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!check_device(dev)) - return 0; + if (!get_device_resources(dev, &iommu, &domain, &devid)) + return map_sg_no_iommu(dev, sglist, nelems, dir); dma_mask = *dev->dma_mask; - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - return map_sg_no_iommu(dev, sglist, nelems, dir); - if (!dma_ops_domain(domain)) return 0; @@ -1958,8 +1936,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_unmap_sg); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &iommu, &domain, &devid)) return; if (!dma_ops_domain(domain)) @@ -1994,24 +1971,22 @@ static void *alloc_coherent(struct device *dev, size_t size, INC_STATS_COUNTER(cnt_alloc_coherent); - if (!check_device(dev)) - return NULL; + if (!get_device_resources(dev, &iommu, &domain, &devid)) { + virt_addr = (void *)__get_free_pages(flag, get_order(size)); + *dma_addr = __pa(virt_addr); + return virt_addr; + } - if (!get_device_resources(dev, &iommu, &domain, &devid)) - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + dma_mask = dev->coherent_dma_mask; + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + flag |= __GFP_ZERO; - flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return NULL; paddr = virt_to_phys(virt_addr); - if (!iommu || !domain) { - *dma_addr = (dma_addr_t)paddr; - return virt_addr; - } - if (!dma_ops_domain(domain)) goto out_free; @@ -2054,12 +2029,7 @@ static void free_coherent(struct device *dev, size_t size, INC_STATS_COUNTER(cnt_free_coherent); - if (!check_device(dev)) - return; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) + if (!get_device_resources(dev, &iommu, &domain, &devid)) goto free_mem; if (!dma_ops_domain(domain)) -- cgit v1.2.2 From 
680525e06ddccda8c51bdddf532cd5b7d950c411 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 18:44:42 +0100 Subject: x86/amd-iommu: Remove iommu parameter from dma_ops_domain_(un)map The parameter is unused in these functions, so remove it from the parameter list. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c5102ebdcbd9..da3f9d8ee395 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1585,8 +1585,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ -static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, unsigned long address, phys_addr_t paddr, int direction) @@ -1620,8 +1619,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, /* * The generic unmapping function for on page in the DMA address space. */ -static void dma_ops_domain_unmap(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static void dma_ops_domain_unmap(struct dma_ops_domain *dom, unsigned long address) { struct aperture_range *aperture; @@ -1700,7 +1698,7 @@ retry: start = address; for (i = 0; i < pages; ++i) { - ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + ret = dma_ops_domain_map(dma_dom, start, paddr, dir); if (ret == DMA_ERROR_CODE) goto out_unmap; @@ -1724,7 +1722,7 @@ out_unmap: for (--i; i >= 0; --i) { start -= PAGE_SIZE; - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); } dma_ops_free_addresses(dma_dom, address, pages); @@ -1754,7 +1752,7 @@ static void __unmap_single(struct amd_iommu *iommu, start = dma_addr; for (i = 0; i < pages; ++i) { - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); start += PAGE_SIZE; } -- cgit v1.2.2 From 576175c2503ae9b0f930ee9a6a0abaf7ef8956ad Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 19:08:46 +0100 Subject: x86/amd-iommu: Make alloc_new_range aware of multiple IOMMUs Since the assumption that a dma_ops domain is bound to only one IOMMU was given up, we need to make alloc_new_range aware of it. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index da3f9d8ee395..687f617b95d7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -788,11 +788,11 @@ static u64 *fetch_pte(struct protection_domain *domain, * aperture in case of dma_ops domain allocation or address allocation * failure.
*/ -static int alloc_new_range(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + struct amd_iommu *iommu; int i; #ifdef CONFIG_IOMMU_STRESS @@ -832,14 +832,17 @@ static int alloc_new_range(struct amd_iommu *iommu, dma_dom->aperture_size += APERTURE_RANGE_SIZE; /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start >= dma_dom->aperture[index]->offset && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); + for_each_iommu(iommu) { + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset + && iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + startpage = iommu->exclusion_start >> PAGE_SHIFT; + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } } /* @@ -1143,7 +1146,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) add_domain_to_list(&dma_dom->domain); - if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -1686,7 +1689,7 @@ retry: */ dma_dom->next_address = dma_dom->aperture_size; - if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) goto out; /* -- cgit v1.2.2 From cd8c82e875c27ee0d8b59fb76bc12aa9db6a70c2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 19:33:56 +0100 Subject: x86/amd-iommu: Remove iommu parameter from __(un)map_single With the prior changes this parameter is no longer required. This patch removes it from the function and all callers. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 687f617b95d7..c04dcb7f40b2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1653,7 +1653,6 @@ static void dma_ops_domain_unmap(struct dma_ops_domain *dom, * Must be called with the domain lock held. */ static dma_addr_t __map_single(struct device *dev, - struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, @@ -1737,8 +1736,7 @@ out_unmap: * Does the reverse of the __map_single function.
Must be called with * the domain lock held too */ -static void __unmap_single(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static void __unmap_single(struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, size_t size, int dir) @@ -1797,7 +1795,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, return DMA_ERROR_CODE; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + addr = __map_single(dev, domain->priv, paddr, size, dir, false, dma_mask); if (addr == DMA_ERROR_CODE) goto out; @@ -1832,7 +1830,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, dir); + __unmap_single(domain->priv, dma_addr, size, dir); iommu_flush_complete(domain); @@ -1890,7 +1888,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, s, nelems, i) { paddr = sg_phys(s); - s->dma_address = __map_single(dev, iommu, domain->priv, + s->dma_address = __map_single(dev, domain->priv, paddr, s->length, dir, false, dma_mask); @@ -1910,7 +1908,7 @@ out: unmap: for_each_sg(sglist, s, mapped_elems, i) { if (s->dma_address) - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } @@ -1946,7 +1944,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } @@ -1996,7 +1994,7 @@ static void *alloc_coherent(struct device *dev, size_t size, spin_lock_irqsave(&domain->lock, flags); - *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + *dma_addr = __map_single(dev, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); if (*dma_addr == DMA_ERROR_CODE) { @@ -2038,7 +2036,7 @@ static void free_coherent(struct device *dev, size_t size, spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); iommu_flush_complete(domain); -- cgit v1.2.2 From f3be07da531ceef1b51295e5becc9bc07670b671 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 19:43:14 +0100 Subject: x86/amd-iommu: Remove iommu specific handling from dma_ops path This patch finishes the removal of all iommu specific handling code in the dma_ops path. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c04dcb7f40b2..2cd5800e6888 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1436,11 +1436,11 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) * in this function. 
*/ static bool get_device_resources(struct device *dev, - struct amd_iommu **iommu, struct protection_domain **domain, u16 *bdf) { struct dma_ops_domain *dma_dom; + struct amd_iommu *iommu; struct pci_dev *pcidev; u16 _bdf; @@ -1450,21 +1450,21 @@ static bool get_device_resources(struct device *dev, pcidev = to_pci_dev(dev); _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); *bdf = amd_iommu_alias_table[_bdf]; - *iommu = amd_iommu_rlookup_table[*bdf]; + iommu = amd_iommu_rlookup_table[*bdf]; *domain = domain_for_device(*bdf); if (*domain == NULL) { dma_dom = find_protection_domain(*bdf); if (!dma_dom) - dma_dom = (*iommu)->default_dom; + dma_dom = iommu->default_dom; *domain = &dma_dom->domain; - attach_device(*iommu, *domain, *bdf); + attach_device(iommu, *domain, *bdf); DUMP_printk("Using protection domain %d for device %s\n", (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) - attach_device(*iommu, *domain, _bdf); + attach_device(iommu, *domain, _bdf); return true; } @@ -1776,7 +1776,6 @@ static dma_addr_t map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; dma_addr_t addr; @@ -1785,7 +1784,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, INC_STATS_COUNTER(cnt_map_single); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; @@ -1815,13 +1814,12 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; INC_STATS_COUNTER(cnt_unmap_single); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; @@ -1864,7 +1862,6 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; int i; @@ -1875,7 +1872,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) return map_sg_no_iommu(dev, sglist, nelems, dir); dma_mask = *dev->dma_mask; @@ -1927,7 +1924,6 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; struct scatterlist *s; u16 devid; @@ -1935,7 +1931,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_unmap_sg); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) return; if (!dma_ops_domain(domain)) @@ -1962,7 +1958,6 @@ static void *alloc_coherent(struct device *dev, size_t size, { unsigned long flags; void *virt_addr; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; phys_addr_t paddr; @@ -1970,7 +1965,7 @@ static void *alloc_coherent(struct device *dev, size_t size, INC_STATS_COUNTER(cnt_alloc_coherent); - if (!get_device_resources(dev, &iommu, &domain, &devid)) { + if (!get_device_resources(dev, &domain, &devid)) { virt_addr = (void *)__get_free_pages(flag, get_order(size)); *dma_addr = __pa(virt_addr); return 
virt_addr; @@ -2022,13 +2017,12 @@ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; INC_STATS_COUNTER(cnt_free_coherent); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) goto free_mem; if (!dma_ops_domain(domain)) -- cgit v1.2.2 From 15898bbcb48fc86c2baff156163df0941ecb6a15 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 15:39:42 +0100 Subject: x86/amd-iommu: Let domain_for_device handle aliases If there is no domain associated with a device yet and the device has an alias device which already has a domain, the original device needs to have the same domain as the alias device. This patch changes domain_for_device to handle this situation and to assign the alias device's domain directly to the device in that case. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 227 ++++++++++++++++++++++++++------------------ 1 file changed, 135 insertions(+), 92 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 2cd5800e6888..75470ffee358 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -71,6 +71,19 @@ static u64 *fetch_pte(struct protection_domain *domain, unsigned long address, int map_size); static void update_domain(struct protection_domain *domain); +/**************************************************************************** + * + * Helper functions + * + ****************************************************************************/ + +static inline u16 get_device_id(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return calc_devid(pdev->bus->number, pdev->devfn); +} + #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1174,26 +1187,13 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -/* - * Find out the protection domain structure for a given PCI device. This - * will give us the pointer to the page table root for example.
- */ -static struct protection_domain *domain_for_device(u16 devid) -{ - struct protection_domain *dom; - unsigned long flags; - - read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = amd_iommu_pd_table[devid]; - read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - return dom; -} - static void set_dte_entry(u16 devid, struct protection_domain *domain) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; u64 pte_root = virt_to_phys(domain->pt_root); + BUG_ON(amd_iommu_pd_table[devid] != NULL); + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -1203,42 +1203,87 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); amd_iommu_pd_table[devid] = domain; + + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; + + /* Flush the changes DTE entry */ + iommu_queue_inv_dev_entry(iommu, devid); +} + +static void clear_dte_entry(u16 devid) +{ + struct protection_domain *domain = amd_iommu_pd_table[devid]; + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + BUG_ON(domain == NULL); + + /* remove domain from the lookup table */ + amd_iommu_pd_table[devid] = NULL; + + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; + + amd_iommu_apply_erratum_63(devid); + + /* decrease reference counters */ + domain->dev_iommu[iommu->index] -= 1; + domain->dev_cnt -= 1; + + iommu_queue_inv_dev_entry(iommu, devid); } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void __attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int __attach_device(struct device *dev, + struct protection_domain *domain) { + u16 devid = get_device_id(dev); + u16 alias = amd_iommu_alias_table[devid]; + /* lock domain */ spin_lock(&domain->lock); - /* update DTE entry */ - set_dte_entry(devid, domain); + /* Some sanity checks */ + if (amd_iommu_pd_table[alias] != NULL && + amd_iommu_pd_table[alias] != domain) + return -EBUSY; - /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; + if (amd_iommu_pd_table[devid] != NULL && + amd_iommu_pd_table[devid] != domain) + return -EBUSY; + + /* Do real assignment */ + if (alias != devid && + amd_iommu_pd_table[alias] == NULL) + set_dte_entry(alias, domain); + + if (amd_iommu_pd_table[devid] == NULL) + set_dte_entry(devid, domain); /* ready */ spin_unlock(&domain->lock); + + return 0; } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int attach_device(struct device *dev, + struct protection_domain *domain) { unsigned long flags; + int ret; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __attach_device(iommu, domain, devid); + ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* @@ -1246,62 +1291,70 @@ static void attach_device(struct amd_iommu *iommu, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. 
*/ - iommu_queue_inv_dev_entry(iommu, devid); iommu_flush_tlb_pde(domain); + + return ret; } /* * Removes a device from a protection domain (unlocked) */ -static void __detach_device(struct protection_domain *domain, u16 devid) +static void __detach_device(struct device *dev) { + u16 devid = get_device_id(dev); struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; BUG_ON(!iommu); - /* lock domain */ - spin_lock(&domain->lock); - - /* remove domain from the lookup table */ - amd_iommu_pd_table[devid] = NULL; - - /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; - - amd_iommu_apply_erratum_63(devid); - - /* decrease reference counters */ - domain->dev_iommu[iommu->index] -= 1; - domain->dev_cnt -= 1; - - /* ready */ - spin_unlock(&domain->lock); + clear_dte_entry(devid); /* * If we run in passthrough mode the device must be assigned to the * passthrough domain if it is detached from any other domain */ - if (iommu_pass_through) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - __attach_device(iommu, pt_domain, devid); - } + if (iommu_pass_through) + __attach_device(dev, pt_domain); } /* * Removes a device from a protection domain (with devtable_lock held) */ -static void detach_device(struct protection_domain *domain, u16 devid) +static void detach_device(struct device *dev) { unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(domain, devid); + __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. 
+ */ +static struct protection_domain *domain_for_device(struct device *dev) +{ + struct protection_domain *dom; + unsigned long flags; + u16 devid, alias; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + dom = amd_iommu_pd_table[devid]; + if (dom == NULL && + amd_iommu_pd_table[alias] != NULL) { + __attach_device(dev, amd_iommu_pd_table[alias]); + dom = amd_iommu_pd_table[devid]; + } + + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return dom; +} + static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -1322,7 +1375,7 @@ static int device_change_notifier(struct notifier_block *nb, if (iommu == NULL) goto out; - domain = domain_for_device(devid); + domain = domain_for_device(dev); if (domain && !dma_ops_domain(domain)) WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " @@ -1334,7 +1387,7 @@ static int device_change_notifier(struct notifier_block *nb, goto out; if (iommu_pass_through) break; - detach_device(domain, devid); + detach_device(dev); break; case BUS_NOTIFY_ADD_DEVICE: /* allocate a protection domain if a device is added */ @@ -1441,30 +1494,25 @@ static bool get_device_resources(struct device *dev, { struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - struct pci_dev *pcidev; - u16 _bdf; if (!check_device(dev)) return false; - pcidev = to_pci_dev(dev); - _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - *bdf = amd_iommu_alias_table[_bdf]; + *bdf = get_device_id(dev); + *domain = domain_for_device(dev); iommu = amd_iommu_rlookup_table[*bdf]; - *domain = domain_for_device(*bdf); - if (*domain == NULL) { - dma_dom = find_protection_domain(*bdf); - if (!dma_dom) - dma_dom = iommu->default_dom; - *domain = &dma_dom->domain; - attach_device(iommu, *domain, *bdf); - DUMP_printk("Using protection domain %d for device %s\n", - (*domain)->id, dev_name(dev)); - } + if (*domain != NULL) + return true; - if (domain_for_device(_bdf) == NULL) - attach_device(iommu, *domain, _bdf); + /* Device not bount yet - bind it */ + dma_dom = find_protection_domain(*bdf); + if (!dma_dom) + dma_dom = iommu->default_dom; + *domain = &dma_dom->domain; + attach_device(dev, *domain); + DUMP_printk("Using protection domain %d for device %s\n", + (*domain)->id, dev_name(dev)); return true; } @@ -2068,7 +2116,7 @@ static void prealloc_protection_domains(void) if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; - if (domain_for_device(devid)) + if (domain_for_device(&dev->dev)) continue; iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -2079,9 +2127,7 @@ static void prealloc_protection_domains(void) init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; - attach_device(iommu, &dma_dom->domain, devid); - if (__devid != devid) - attach_device(iommu, &dma_dom->domain, __devid); + attach_device(&dev->dev, &dma_dom->domain); list_add_tail(&dma_dom->list, &iommu_pd_list); } @@ -2174,7 +2220,7 @@ static void cleanup_domain(struct protection_domain *domain) for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) if (amd_iommu_pd_table[devid] == domain) - __detach_device(domain, devid); + clear_dte_entry(devid); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } @@ -2262,7 +2308,6 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { - struct protection_domain *domain = dom->priv; struct amd_iommu *iommu; 
struct pci_dev *pdev; u16 devid; @@ -2275,7 +2320,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, devid = calc_devid(pdev->bus->number, pdev->devfn); if (devid > 0) - detach_device(domain, devid); + detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -2292,6 +2337,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct protection_domain *old_domain; struct amd_iommu *iommu; struct pci_dev *pdev; + int ret; u16 devid; if (dev->bus != &pci_bus_type) @@ -2309,15 +2355,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, if (!iommu) return -EINVAL; - old_domain = domain_for_device(devid); + old_domain = amd_iommu_pd_table[devid]; if (old_domain) - detach_device(old_domain, devid); + detach_device(dev); - attach_device(iommu, domain, devid); + ret = attach_device(dev, domain); iommu_completion_wait(iommu); - return 0; + return ret; } static int amd_iommu_map_range(struct iommu_domain *dom, @@ -2414,8 +2460,9 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { + struct amd_iommu *iommu; struct pci_dev *dev = NULL; - u16 devid, devid2; + u16 devid; /* allocate passthroug domain */ pt_domain = protection_domain_alloc(); @@ -2425,20 +2472,16 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - struct amd_iommu *iommu; devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; - devid2 = amd_iommu_alias_table[devid]; - - iommu = amd_iommu_rlookup_table[devid2]; + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - __attach_device(iommu, pt_domain, devid); - __attach_device(iommu, pt_domain, devid2); + attach_device(&dev->dev, pt_domain); } pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); -- cgit v1.2.2 From 94f6d190eeed91cb2bb901aa7816edd1e2405347 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 16:40:02 +0100 Subject: x86/amd-iommu: Simplify get_device_resources() With the previous changes the get_device_resources function can be simplified even more. The only important information for the callers is the protection domain. This patch renames the function to get_domain() and lets it return only the protection domain for a device. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 86 +++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 75470ffee358..e5bbe9a0c192 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1463,6 +1463,7 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) { struct dma_ops_domain *entry, *ret = NULL; unsigned long flags; + u16 alias = amd_iommu_alias_table[devid]; if (list_empty(&iommu_pd_list)) return NULL; @@ -1470,7 +1471,8 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) spin_lock_irqsave(&iommu_pd_list_lock, flags); list_for_each_entry(entry, &iommu_pd_list, list) { - if (entry->target_dev == devid) { + if (entry->target_dev == devid || + entry->target_dev == alias) { ret = entry; break; } @@ -1488,33 +1490,31 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) * If the device is not yet associated with a domain this is also done * in this function.
*/ -static bool get_device_resources(struct device *dev, - struct protection_domain **domain, - u16 *bdf) +static struct protection_domain *get_domain(struct device *dev) { + struct protection_domain *domain; struct dma_ops_domain *dma_dom; - struct amd_iommu *iommu; + u16 devid = get_device_id(dev); if (!check_device(dev)) - return false; + return ERR_PTR(-EINVAL); - *bdf = get_device_id(dev); - *domain = domain_for_device(dev); - iommu = amd_iommu_rlookup_table[*bdf]; + domain = domain_for_device(dev); + if (domain != NULL && !dma_ops_domain(domain)) + return ERR_PTR(-EBUSY); - if (*domain != NULL) - return true; + if (domain != NULL) + return domain; /* Device not bount yet - bind it */ - dma_dom = find_protection_domain(*bdf); + dma_dom = find_protection_domain(devid); if (!dma_dom) - dma_dom = iommu->default_dom; - *domain = &dma_dom->domain; - attach_device(dev, *domain); + dma_dom = amd_iommu_rlookup_table[devid]->default_dom; + attach_device(dev, &dma_dom->domain); DUMP_printk("Using protection domain %d for device %s\n", - (*domain)->id, dev_name(dev)); + dma_dom->domain.id, dev_name(dev)); - return true; + return &dma_dom->domain; } static void update_device_table(struct protection_domain *domain) @@ -1825,23 +1825,22 @@ static dma_addr_t map_page(struct device *dev, struct page *page, { unsigned long flags; struct protection_domain *domain; - u16 devid; dma_addr_t addr; u64 dma_mask; phys_addr_t paddr = page_to_phys(page) + offset; INC_STATS_COUNTER(cnt_map_single); - if (!get_device_resources(dev, &domain, &devid)) - /* device not handled by any AMD IOMMU */ + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) return (dma_addr_t)paddr; + else if (IS_ERR(domain)) + return DMA_ERROR_CODE; dma_mask = *dev->dma_mask; - if (!dma_ops_domain(domain)) - return DMA_ERROR_CODE; - spin_lock_irqsave(&domain->lock, flags); + addr = __map_single(dev, domain->priv, paddr, size, dir, false, dma_mask); if (addr == DMA_ERROR_CODE) @@ -1863,15 +1862,11 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, { unsigned long flags; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_unmap_single); - if (!get_device_resources(dev, &domain, &devid)) - /* device not handled by any AMD IOMMU */ - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); @@ -1911,7 +1906,6 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, { unsigned long flags; struct protection_domain *domain; - u16 devid; int i; struct scatterlist *s; phys_addr_t paddr; @@ -1920,14 +1914,14 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!get_device_resources(dev, &domain, &devid)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) return map_sg_no_iommu(dev, sglist, nelems, dir); + else if (IS_ERR(domain)) + return 0; dma_mask = *dev->dma_mask; - if (!dma_ops_domain(domain)) - return 0; - spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1974,15 +1968,12 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, unsigned long flags; struct protection_domain *domain; struct scatterlist *s; - u16 devid; int i; INC_STATS_COUNTER(cnt_unmap_sg); - if (!get_device_resources(dev, &domain, &devid)) - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); @@ -2007,17 +1998,18 @@ static void 
*alloc_coherent(struct device *dev, size_t size, unsigned long flags; void *virt_addr; struct protection_domain *domain; - u16 devid; phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; INC_STATS_COUNTER(cnt_alloc_coherent); - if (!get_device_resources(dev, &domain, &devid)) { + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) { virt_addr = (void *)__get_free_pages(flag, get_order(size)); *dma_addr = __pa(virt_addr); return virt_addr; - } + } else if (IS_ERR(domain)) + return NULL; dma_mask = dev->coherent_dma_mask; flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); @@ -2029,9 +2021,6 @@ static void *alloc_coherent(struct device *dev, size_t size, paddr = virt_to_phys(virt_addr); - if (!dma_ops_domain(domain)) - goto out_free; - if (!dma_mask) dma_mask = *dev->dma_mask; @@ -2066,14 +2055,11 @@ static void free_coherent(struct device *dev, size_t size, { unsigned long flags; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_free_coherent); - if (!get_device_resources(dev, &domain, &devid)) - goto free_mem; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) goto free_mem; spin_lock_irqsave(&domain->lock, flags); -- cgit v1.2.2 From 71c70984e5afc20d304fbb523f1c8bb42c4ceb36 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 16:43:06 +0100 Subject: x86/amd-iommu: Move find_protection_domain to helper functions This is a helper function, and when it's placed in the helper function section we can remove its forward declaration. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 57 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index e5bbe9a0c192..405f8dad7c77 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -59,7 +59,6 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); -static struct dma_ops_domain *find_protection_domain(u16 devid); static u64 *alloc_pte(struct protection_domain *domain, unsigned long address, int end_lvl, u64 **pte_page, gfp_t gfp); @@ -84,6 +83,34 @@ static inline u16 get_device_id(struct device *dev) return calc_devid(pdev->bus->number, pdev->devfn); } +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + u16 alias = amd_iommu_alias_table[devid]; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list,
list) { - if (entry->target_dev == devid || - entry->target_dev == alias) { - ret = entry; - break; - } - } - - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - return ret; -} - /* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the -- cgit v1.2.2 From 98fc5a693bbdda498a556654c70d1e31a186c988 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 17:19:23 +0100 Subject: x86/amd-iommu: Use get_device_id and check_device where appropriate The logic of these two functions is reimplemented (at least in parts) in places in the code. This patch removes these code duplications and uses the functions instead. As a side effect it moves check_device() to the helper function code section. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 110 ++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 405f8dad7c77..d10195b685a7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -111,6 +111,33 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) return ret; } +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ + u16 devid; + + if (!dev || !dev->dma_mask) + return false; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return false; + + devid = get_device_id(dev); + + /* Out of our scope? */ + if (devid > amd_iommu_last_bdf) + return false; + + if (amd_iommu_rlookup_table[devid] == NULL) + return false; + + return true; +} + #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1386,22 +1413,17 @@ static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct pci_dev *pdev = to_pci_dev(dev); - u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + u16 devid; struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; unsigned long flags; - if (devid > amd_iommu_last_bdf) - goto out; - - devid = amd_iommu_alias_table[devid]; - - iommu = amd_iommu_rlookup_table[devid]; - if (iommu == NULL) - goto out; + if (!check_device(dev)) + return 0; + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; domain = domain_for_device(dev); if (domain && !dma_ops_domain(domain)) @@ -1452,36 +1474,6 @@ static struct notifier_block device_nb = { * *****************************************************************************/ -/* - * This function checks if the driver got a valid device from the caller to - * avoid dereferencing invalid pointers. - */ -static bool check_device(struct device *dev) -{ - u16 bdf; - struct pci_dev *pcidev; - - if (!dev || !dev->dma_mask) - return false; - - /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) - return false; - - pcidev = to_pci_dev(dev); - - bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* Out of our scope? */ - if (bdf > amd_iommu_last_bdf) - return false; - - if (amd_iommu_rlookup_table[bdf] == NULL) - return false; - - return true; -} - /* * In the dma_ops path we only have the struct device. 
This function * finds the corresponding IOMMU, the protection domain and the @@ -2094,15 +2086,20 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - u16 devid, __devid; + u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - __devid = devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) + + /* Do we handle this device? */ + if (!check_device(&dev->dev)) continue; - devid = amd_iommu_alias_table[devid]; + + /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; + + devid = get_device_id(&dev->dev); + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; @@ -2294,17 +2291,14 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { struct amd_iommu *iommu; - struct pci_dev *pdev; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return; - pdev = to_pci_dev(dev); - - devid = calc_devid(pdev->bus->number, pdev->devfn); + devid = get_device_id(dev); - if (devid > 0) + if (amd_iommu_pd_table[devid] != NULL) detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; @@ -2321,20 +2315,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct protection_domain *domain = dom->priv; struct protection_domain *old_domain; struct amd_iommu *iommu; - struct pci_dev *pdev; int ret; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return -EINVAL; - pdev = to_pci_dev(dev); - - devid = calc_devid(pdev->bus->number, pdev->devfn); - - if (devid >= amd_iommu_last_bdf || - devid != amd_iommu_alias_table[devid]) - return -EINVAL; + devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -2458,10 +2445,11 @@ int __init amd_iommu_init_passthrough(void) while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) + if (!check_device(&dev->dev)) continue; + devid = get_device_id(&dev->dev); + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; -- cgit v1.2.2 From 87a64d523825351a23743e69949c2a8c2077cecf Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 17:26:43 +0100 Subject: x86/amd-iommu: Remove iommu parameter from dma_ops_domain_alloc This function doesn't use the parameter anymore so it can be removed. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d10195b685a7..17e83ecb8b22 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1188,7 +1188,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) +static struct dma_ops_domain *dma_ops_domain_alloc(void) { struct dma_ops_domain *dma_dom; @@ -1443,7 +1443,7 @@ static int device_change_notifier(struct notifier_block *nb, dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu); + dma_domain = dma_ops_domain_alloc(); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -2085,7 +2085,6 @@ static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; - struct amd_iommu *iommu; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -2100,10 +2099,7 @@ static void prealloc_protection_domains(void) devid = get_device_id(&dev->dev); - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) - continue; - dma_dom = dma_ops_domain_alloc(iommu); + dma_dom = dma_ops_domain_alloc(); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); @@ -2139,7 +2135,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ for_each_iommu(iommu) { - iommu->default_dom = dma_ops_domain_alloc(iommu); + iommu->default_dom = dma_ops_domain_alloc(); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; -- cgit v1.2.2 From 308973d3b958b9328a1051642c81ee6dbc5021a4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 17:43:32 +0100 Subject: x86/amd-iommu: Move some pte allocation functions in the right section This patch moves alloc_pte() and fetch_pte() into the page table handling code section so that the forward declarations for them could be removed. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 193 +++++++++++++++++++++----------------------- 1 file changed, 94 insertions(+), 99 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 17e83ecb8b22..90b365024c24 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -59,15 +59,10 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, int end_lvl, - u64 **pte_page, gfp_t gfp); static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages); static void reset_iommu_command_buffer(struct amd_iommu *iommu); -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size); static void update_domain(struct protection_domain *domain); /**************************************************************************** @@ -664,6 +659,100 @@ void amd_iommu_flush_all_devices(void) * ****************************************************************************/ +/* + * This function is used to add another level to an IO page table. 
Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. + */ +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) +{ + u64 *pte; + + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; + + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; +} + +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) +{ + u64 *pte, *page; + int level; + + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); + + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } + + level -= 1; + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page && level == end_lvl) + *pte_page = pte; + + pte = &pte[PM_LEVEL_INDEX(level, address)]; + } + + return pte; +} + +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) +{ + int level; + u64 *pte; + + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + level -= 1; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; + + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } + + return pte; +} + /* * Generic mapping functions. It maps a physical address into a DMA * address space. It allocates the page table pages if necessary. @@ -819,37 +908,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size) -{ - int level; - u64 *pte; - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > map_size) { - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - - if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { - pte = NULL; - break; - } - } - - return pte; -} - /* * This function is used to add a new aperture range to an existing * aperture in case of dma_ops domain allocation or address allocation @@ -1534,69 +1592,6 @@ static void update_domain(struct protection_domain *domain) domain->updated = false; } -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. 
- */ -static bool increase_address_space(struct protection_domain *domain, - gfp_t gfp) -{ - u64 *pte; - - if (domain->mode == PAGE_MODE_6_LEVEL) - /* address space already 64 bit large */ - return false; - - pte = (void *)get_zeroed_page(gfp); - if (!pte) - return false; - - *pte = PM_LEVEL_PDE(domain->mode, - virt_to_phys(domain->pt_root)); - domain->pt_root = pte; - domain->mode += 1; - domain->updated = true; - - return true; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - int end_lvl, - u64 **pte_page, - gfp_t gfp) -{ - u64 *pte, *page; - int level; - - while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > end_lvl) { - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); - } - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - /* * This function fetches the PTE for a given address in the aperture */ -- cgit v1.2.2 From 171e7b3739e175eea7b32eca9dbe189589e14a28 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 17:47:56 +0100 Subject: x86/amd-iommu: Rearrange dma_ops related functions This patch rearranges two dma_ops related functions so that their forward declarations are no longer necessary. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 89 +++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 90b365024c24..14b60c0cdc70 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -57,11 +57,6 @@ struct iommu_cmd { u32 data[4]; }; -static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, - struct unity_map_entry *e); -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages); static void reset_iommu_command_buffer(struct amd_iommu *iommu); static void update_domain(struct protection_domain *domain); @@ -822,28 +817,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, return 0; } -/* - * Init the unity mappings for a specific IOMMU in the system - * - * Basically iterates over all unity mapping entries and applies them to - * the default domain DMA of that IOMMU if necessary. - */ -static int iommu_init_unity_mappings(struct amd_iommu *iommu) -{ - struct unity_map_entry *entry; - int ret; - - list_for_each_entry(entry, &amd_iommu_unity_map, list) { - if (!iommu_for_unity_map(iommu, entry)) - continue; - ret = dma_ops_unity_map(iommu->default_dom, entry); - if (ret) - return ret; - } - - return 0; -} - /* * This function actually applies the mapping to the page table of the * dma_ops domain. @@ -872,6 +845,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, return 0; } +/* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary.
+ */ +static int iommu_init_unity_mappings(struct amd_iommu *iommu) +{ + struct unity_map_entry *entry; + int ret; + + list_for_each_entry(entry, &amd_iommu_unity_map, list) { + if (!iommu_for_unity_map(iommu, entry)) + continue; + ret = dma_ops_unity_map(iommu->default_dom, entry); + if (ret) + return ret; + } + + return 0; +} + /* * Inits the unity mappings required for a specific device */ @@ -908,6 +903,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ +/* + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. + */ +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages) +{ + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; + + if (start_page + pages > last_page) + pages = last_page - start_page; + + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); + } +} + /* * This function is used to add a new aperture range to an existing * aperture in case of dma_ops domain allocation or address allocation @@ -1166,26 +1181,6 @@ static void domain_id_free(int id) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -/* - * Used to reserve address ranges in the aperture (e.g. for exclusion - * ranges. - */ -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages) -{ - unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - - if (start_page + pages > last_page) - pages = last_page - start_page; - - for (i = start_page; i < start_page + pages; ++i) { - int index = i / APERTURE_RANGE_PAGES; - int page = i % APERTURE_RANGE_PAGES; - __set_bit(page, dom->aperture[index]->bitmap); - } -} - static void free_pagetable(struct protection_domain *domain) { int i, j; -- cgit v1.2.2 From 8793abeb783c12cc37f92f6133fd6468152b98df Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 27 Nov 2009 11:40:33 +0100 Subject: x86/amd-iommu: Remove support for domain sharing This patch makes device isolation mandatory and removes support for the amd_iommu=share option. This simplifies the code in several places. 
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 3 --- arch/x86/kernel/amd_iommu.c | 10 ++-------- arch/x86/kernel/amd_iommu_init.c | 17 ----------------- 3 files changed, 2 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 4899f783df68..02b6a0fd863c 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -451,9 +451,6 @@ extern struct protection_domain **amd_iommu_pd_table; /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* will be 1 if device isolation is enabled */ -extern bool amd_iommu_isolate; - /* * If true, the addresses will be flushed on unmap time, not when * they are reused diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 14b60c0cdc70..ed58a1688391 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -148,7 +148,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); static struct dentry *stats_dir; -static struct dentry *de_isolate; static struct dentry *de_fflush; static void amd_iommu_stats_add(struct __iommu_counter *cnt) @@ -166,9 +165,6 @@ static void amd_iommu_stats_init(void) if (stats_dir == NULL) return; - de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, - (u32 *)&amd_iommu_isolate); - de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, (u32 *)&amd_iommu_unmap_flush); @@ -2135,11 +2131,9 @@ int __init amd_iommu_init_dma_ops(void) } /* - * If device isolation is enabled, pre-allocate the protection - * domains for each device. + * Pre-allocate the protection domains for each device. */ - if (amd_iommu_isolate) - prealloc_protection_domains(); + prealloc_protection_domains(); iommu_detected = 1; swiotlb = 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fbe4c3c02a91..fe1686f6f91b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -125,13 +125,6 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -#ifdef CONFIG_IOMMU_STRESS -bool amd_iommu_isolate = false; -#else -bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ -#endif - bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -1308,12 +1301,6 @@ static int __init amd_iommu_init(void) if (iommu_pass_through) goto out; - printk(KERN_INFO "AMD-Vi: device isolation "); - if (amd_iommu_isolate) - printk("enabled\n"); - else - printk("disabled\n"); - if (amd_iommu_unmap_flush) printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else @@ -1387,10 +1374,6 @@ static int __init parse_amd_iommu_dump(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = true; - if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } -- cgit v1.2.2 From 657cbb6b6cba0f9c98c5299e0c803b2c0e67ea0a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 15:26:46 +0100 Subject: x86/amd-iommu: Use dev->archdata.iommu to store iommu related information This patch changes IOMMU code to use dev->archdata.iommu to store information about the alias device and the domain the
device is attached to. This allows the driver to get rid of the amd_iommu_pd_table in the future. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 8 +++ arch/x86/include/asm/device.h | 2 +- arch/x86/kernel/amd_iommu.c | 109 ++++++++++++++++++++++++++------- 3 files changed, 95 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 02b6a0fd863c..9eaa27b46860 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -247,6 +247,14 @@ struct protection_domain { }; +/* + * This struct contains device specific data for the IOMMU + */ +struct iommu_dev_data { + struct device *alias; /* The Alias Device */ + struct protection_domain *domain; /* Domain the device is bound to */ +}; + /* * For dynamic growth the aperture size is split into ranges of 128MB of * DMA address space each. This struct represents one such range. diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index cee34e9ca45b..029f230ab637 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -8,7 +8,7 @@ struct dev_archdata { #ifdef CONFIG_X86_64 struct dma_map_ops *dma_ops; #endif -#ifdef CONFIG_DMAR +#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ #endif }; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index ed58a1688391..3214e8806f95 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -73,6 +73,11 @@ static inline u16 get_device_id(struct device *dev) return calc_devid(pdev->bus->number, pdev->devfn); } +static struct iommu_dev_data *get_dev_data(struct device *dev) +{ + return dev->archdata.iommu; +} + /* * In this function the list of preallocated protection domains is traversed to * find the domain for a specific device @@ -128,6 +133,35 @@ static bool check_device(struct device *dev) return true; } +static int iommu_init_device(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct pci_dev *pdev; + u16 devid, alias; + + if (dev->archdata.iommu) + return 0; + + dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); + if (pdev) + dev_data->alias = &pdev->dev; + + dev->archdata.iommu = dev_data; + + + return 0; +} + +static void iommu_uninit_device(struct device *dev) +{ + kfree(dev->archdata.iommu); +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1346,28 +1380,39 @@ static void clear_dte_entry(u16 devid) static int __attach_device(struct device *dev, struct protection_domain *domain) { - u16 devid = get_device_id(dev); - u16 alias = amd_iommu_alias_table[devid]; + struct iommu_dev_data *dev_data, *alias_data; + u16 devid, alias; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + if (!alias_data) + return -EINVAL; /* lock domain */ spin_lock(&domain->lock); /* Some sanity checks */ - if (amd_iommu_pd_table[alias] != NULL && - amd_iommu_pd_table[alias] != domain) + if (alias_data->domain != NULL && + alias_data->domain != domain) return -EBUSY; - if (amd_iommu_pd_table[devid] != NULL && - amd_iommu_pd_table[devid] != domain) + if (dev_data->domain != NULL && + dev_data->domain != domain) return -EBUSY; /* Do real assignment */ if (alias != devid 
&& - amd_iommu_pd_table[alias] == NULL) + alias_data->domain == NULL) { + alias_data->domain = domain; set_dte_entry(alias, domain); + } - if (amd_iommu_pd_table[devid] == NULL) + if (dev_data->domain == NULL) { + dev_data->domain = domain; set_dte_entry(devid, domain); + } /* ready */ spin_unlock(&domain->lock); @@ -1406,10 +1451,12 @@ static void __detach_device(struct device *dev) { u16 devid = get_device_id(dev); struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + struct iommu_dev_data *dev_data = get_dev_data(dev); BUG_ON(!iommu); clear_dte_entry(devid); + dev_data->domain = NULL; /* * If we run in passthrough mode the device must be assigned to the @@ -1439,18 +1486,23 @@ static void detach_device(struct device *dev) static struct protection_domain *domain_for_device(struct device *dev) { struct protection_domain *dom; + struct iommu_dev_data *dev_data, *alias_data; unsigned long flags; u16 devid, alias; - devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + if (!alias_data) + return NULL; read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = amd_iommu_pd_table[devid]; + dom = dev_data->domain; if (dom == NULL && - amd_iommu_pd_table[alias] != NULL) { - __attach_device(dev, amd_iommu_pd_table[alias]); - dom = amd_iommu_pd_table[devid]; + alias_data->domain != NULL) { + __attach_device(dev, alias_data->domain); + dom = alias_data->domain; } read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); @@ -1473,14 +1525,12 @@ static int device_change_notifier(struct notifier_block *nb, devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - domain = domain_for_device(dev); - - if (domain && !dma_ops_domain(domain)) - WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " - "to a non-dma-ops domain\n", dev_name(dev)); switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: + + domain = domain_for_device(dev); + if (!domain) goto out; if (iommu_pass_through) @@ -1488,6 +1538,11 @@ static int device_change_notifier(struct notifier_block *nb, detach_device(dev); break; case BUS_NOTIFY_ADD_DEVICE: + + iommu_init_device(dev); + + domain = domain_for_device(dev); + /* allocate a protection domain if a device is added */ dma_domain = find_protection_domain(devid); if (dma_domain) @@ -1502,6 +1557,10 @@ static int device_change_notifier(struct notifier_block *nb, spin_unlock_irqrestore(&iommu_pd_list_lock, flags); break; + case BUS_NOTIFY_DEL_DEVICE: + + iommu_uninit_device(dev); + default: goto out; } @@ -2079,6 +2138,8 @@ static void prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; + iommu_init_device(&dev->dev); + /* Is there already any domain for it? 
*/ if (domain_for_device(&dev->dev)) continue; @@ -2270,6 +2331,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { + struct iommu_dev_data *dev_data = dev->archdata.iommu; struct amd_iommu *iommu; u16 devid; @@ -2278,7 +2340,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, devid = get_device_id(dev); - if (amd_iommu_pd_table[devid] != NULL) + if (dev_data->domain != NULL) detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; @@ -2293,7 +2355,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { struct protection_domain *domain = dom->priv; - struct protection_domain *old_domain; + struct iommu_dev_data *dev_data; struct amd_iommu *iommu; int ret; u16 devid; @@ -2301,14 +2363,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, if (!check_device(dev)) return -EINVAL; + dev_data = dev->archdata.iommu; + devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return -EINVAL; - old_domain = amd_iommu_pd_table[devid]; - if (old_domain) + if (dev_data->domain) detach_device(dev); ret = attach_device(dev, domain); -- cgit v1.2.2 From 241000556f751dacd332df6ab2e903a23746e51e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Nov 2009 15:59:57 +0100 Subject: x86/amd-iommu: Add device bind reference counting This patch adds a reference count to each device to count how often the device was bound to that domain. This is important for single devices that act as an alias for a number of others. These devices must stay bound to their domains until all devices that alias to it are unbound from the same domain. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 1 + arch/x86/kernel/amd_iommu.c | 37 ++++++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 9eaa27b46860..434e90ed89c5 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -253,6 +253,7 @@ struct protection_domain { struct iommu_dev_data { struct device *alias; /* The Alias Device */ struct protection_domain *domain; /* Domain the device is bound to */ + atomic_t bind; /* Domain attach reverent count */ }; /* diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 3214e8806f95..f5db7d5e444e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -152,6 +152,8 @@ static int iommu_init_device(struct device *dev) if (pdev) dev_data->alias = &pdev->dev; + atomic_set(&dev_data->bind, 0); + dev->archdata.iommu = dev_data; @@ -1403,10 +1405,13 @@ static int __attach_device(struct device *dev, return -EBUSY; /* Do real assignment */ - if (alias != devid && - alias_data->domain == NULL) { - alias_data->domain = domain; - set_dte_entry(alias, domain); + if (alias != devid) { + if (alias_data->domain == NULL) { + alias_data->domain = domain; + set_dte_entry(alias, domain); + } + + atomic_inc(&alias_data->bind); } if (dev_data->domain == NULL) { @@ -1414,6 +1419,8 @@ static int __attach_device(struct device *dev, set_dte_entry(devid, domain); } + atomic_inc(&dev_data->bind); + /* ready */ spin_unlock(&domain->lock); @@ -1449,20 +1456,34 @@ static int attach_device(struct device *dev, */ static void __detach_device(struct device *dev) { - u16 devid = get_device_id(dev); + u16 devid = 
get_device_id(dev), alias; struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; BUG_ON(!iommu); - clear_dte_entry(devid); - dev_data->domain = NULL; + devid = get_device_id(dev); + alias = get_device_id(dev_data->alias); + + if (devid != alias) { + alias_data = get_dev_data(dev_data->alias); + if (atomic_dec_and_test(&alias_data->bind)) { + clear_dte_entry(alias); + alias_data->domain = NULL; + } + } + + if (atomic_dec_and_test(&dev_data->bind)) { + clear_dte_entry(devid); + dev_data->domain = NULL; + } /* * If we run in passthrough mode the device must be assigned to the * passthrough domain if it is detached from any other domain */ - if (iommu_pass_through) + if (iommu_pass_through && dev_data->domain == NULL) __attach_device(dev, pt_domain); } -- cgit v1.2.2 From 7c392cbe984d904f7c89a6a75b2ac245254e8da5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 11:13:32 +0100 Subject: x86/amd-iommu: Keep devices per domain in a list This patch introduces a list to each protection domain which keeps all devices associated with the domain. This can be used later to optimize certain functions and to completely remove the amd_iommu_pd_table. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 ++ arch/x86/kernel/amd_iommu.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 434e90ed89c5..93953d1922c4 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -235,6 +235,7 @@ extern bool amd_iommu_np_cache; */ struct protection_domain { struct list_head list; /* for list of all protection domains */ + struct list_head dev_list; /* List of all devices in this domain */ spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ @@ -251,6 +252,7 @@ struct protection_domain { * This struct contains device specific data for the IOMMU */ struct iommu_dev_data { + struct list_head list; /* For domain->dev_list */ struct device *alias; /* The Alias Device */ struct protection_domain *domain; /* Domain the device is bound to */ atomic_t bind; /* Domain attach reverent count */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f5db7d5e444e..530d6080940f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1286,6 +1286,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(void) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; + INIT_LIST_HEAD(&dma_dom->domain.dev_list); dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; @@ -1408,6 +1409,7 @@ static int __attach_device(struct device *dev, if (alias != devid) { if (alias_data->domain == NULL) { alias_data->domain = domain; + list_add(&alias_data->list, &domain->dev_list); set_dte_entry(alias, domain); } @@ -1416,6 +1418,7 @@ static int __attach_device(struct device *dev, if (dev_data->domain == NULL) { dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); set_dte_entry(devid, domain); } @@ -1460,6 +1463,7 @@ static void __detach_device(struct device *dev) struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; struct iommu_dev_data *dev_data = get_dev_data(dev); struct
iommu_dev_data *alias_data; + unsigned long flags; BUG_ON(!iommu); @@ -1469,13 +1473,19 @@ static void __detach_device(struct device *dev) if (devid != alias) { alias_data = get_dev_data(dev_data->alias); if (atomic_dec_and_test(&alias_data->bind)) { + spin_lock_irqsave(&alias_data->domain->lock, flags); clear_dte_entry(alias); + list_del(&alias_data->list); + spin_unlock_irqrestore(&alias_data->domain->lock, flags); alias_data->domain = NULL; } } if (atomic_dec_and_test(&dev_data->bind)) { + spin_lock_irqsave(&dev_data->domain->lock, flags); clear_dte_entry(devid); + list_del(&dev_data->list); + spin_unlock_irqrestore(&dev_data->domain->lock, flags); dev_data->domain = NULL; } @@ -2294,6 +2304,7 @@ static struct protection_domain *protection_domain_alloc(void) domain->id = domain_id_alloc(); if (!domain->id) goto out_err; + INIT_LIST_HEAD(&domain->dev_list); add_domain_to_list(domain); -- cgit v1.2.2 From 7f760ddd702d162d693bc79f62c3bdd7fe55bd9d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 14:49:59 +0100 Subject: x86/amd-iommu: Cleanup attach/detach_device code This patch cleans up the attach_device and detach_device paths and fixes reference counting while at it. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 102 +++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 530d6080940f..e3363fd5eef5 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1329,7 +1329,6 @@ static bool dma_ops_domain(struct protection_domain *domain) static void set_dte_entry(u16 devid, struct protection_domain *domain) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; u64 pte_root = virt_to_phys(domain->pt_root); BUG_ON(amd_iommu_pd_table[devid] != NULL); @@ -1344,18 +1343,11 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_pd_table[devid] = domain; - /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; - - /* Flush the changes DTE entry */ - iommu_queue_inv_dev_entry(iommu, devid); } static void clear_dte_entry(u16 devid) { struct protection_domain *domain = amd_iommu_pd_table[devid]; - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; BUG_ON(domain == NULL); @@ -1368,11 +1360,51 @@ static void clear_dte_entry(u16 devid) amd_iommu_dev_table[devid].data[2] = 0; amd_iommu_apply_erratum_63(devid); +} + +static void do_attach(struct device *dev, struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + set_dte_entry(devid, domain); + + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; + + /* Flush the DTE entry */ + iommu_queue_inv_dev_entry(iommu, devid); +} + +static void do_detach(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); /* decrease reference counters */ - domain->dev_iommu[iommu->index] -= 1; - domain->dev_cnt -= 1; + dev_data->domain->dev_iommu[iommu->index] -= 1; + dev_data->domain->dev_cnt -= 1; + + /* Update data structures */ + dev_data->domain = 
NULL; + list_del(&dev_data->list); + clear_dte_entry(devid); + /* Flush the DTE entry */ iommu_queue_inv_dev_entry(iommu, devid); } @@ -1384,12 +1416,10 @@ static int __attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data, *alias_data; - u16 devid, alias; - devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); + if (!alias_data) return -EINVAL; @@ -1406,21 +1436,16 @@ static int __attach_device(struct device *dev, return -EBUSY; /* Do real assignment */ - if (alias != devid) { - if (alias_data->domain == NULL) { - alias_data->domain = domain; - list_add(&alias_data->list, &domain->dev_list); - set_dte_entry(alias, domain); - } + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (alias_data->domain == NULL) + do_attach(dev_data->alias, domain); atomic_inc(&alias_data->bind); } - if (dev_data->domain == NULL) { - dev_data->domain = domain; - list_add(&dev_data->list, &domain->dev_list); - set_dte_entry(devid, domain); - } + if (dev_data->domain == NULL) + do_attach(dev, domain); atomic_inc(&dev_data->bind); @@ -1459,35 +1484,24 @@ static int attach_device(struct device *dev, */ static void __detach_device(struct device *dev) { - u16 devid = get_device_id(dev), alias; - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; unsigned long flags; - BUG_ON(!iommu); + BUG_ON(!dev_data->domain); - devid = get_device_id(dev); - alias = get_device_id(dev_data->alias); + spin_lock_irqsave(&dev_data->domain->lock, flags); - if (devid != alias) { + if (dev_data->alias != dev) { alias_data = get_dev_data(dev_data->alias); - if (atomic_dec_and_test(&alias_data->bind)) { - spin_lock_irqsave(&alias_data->domain->lock, flags); - clear_dte_entry(alias); - list_del(&alias_data->list); - spin_unlock_irqrestore(&alias_data->domain->lock, flags); - alias_data->domain = NULL; - } + if (atomic_dec_and_test(&alias_data->bind)) + do_detach(dev_data->alias); } - if (atomic_dec_and_test(&dev_data->bind)) { - spin_lock_irqsave(&dev_data->domain->lock, flags); - clear_dte_entry(devid); - list_del(&dev_data->list); - spin_unlock_irqrestore(&dev_data->domain->lock, flags); - dev_data->domain = NULL; - } + if (atomic_dec_and_test(&dev_data->bind)) + do_detach(dev); + + spin_unlock_irqrestore(&dev_data->domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the -- cgit v1.2.2 From 3fa43655d81d471d47c44b0db4e2be1f8af32207 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 15:04:38 +0100 Subject: x86/amd-iommu: Introduce iommu_flush_device() function This patch adds a function to flush a DTE entry for a given struct device and replaces iommu_queue_inv_dev_entry calls with this function where appropriate. 
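A rough before/after sketch of a typical call site (condensed; the hunks below are the real change):

        /* before: each caller resolved the IOMMU and device id by hand */
        devid = get_device_id(dev);
        iommu = amd_iommu_rlookup_table[devid];
        iommu_queue_inv_dev_entry(iommu, devid);

        /* after: derive both from the struct device in one helper */
        iommu_flush_device(dev);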
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index e3363fd5eef5..41c4ebecced4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -494,6 +494,17 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) return ret; } +static int iommu_flush_device(struct device *dev) +{ + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + + return iommu_queue_inv_dev_entry(iommu, devid); +} + static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, u16 domid, int pde, int s) { @@ -1382,7 +1393,7 @@ static void do_attach(struct device *dev, struct protection_domain *domain) domain->dev_cnt += 1; /* Flush the DTE entry */ - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); } static void do_detach(struct device *dev) @@ -1405,7 +1416,7 @@ static void do_detach(struct device *dev) clear_dte_entry(devid); /* Flush the DTE entry */ - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); } /* @@ -1610,7 +1621,7 @@ static int device_change_notifier(struct notifier_block *nb, goto out; } - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); out: @@ -2393,7 +2404,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); } -- cgit v1.2.2 From b00d3bcff4d996f65e337d404b0df5dc201a01ab Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 15:35:33 +0100 Subject: x86/amd-iommu: Cleanup DTE flushing code This patch cleans up the code to flush device table entries in the IOMMU. With this change the driver can get rid of the iommu_queue_inv_dev_entry() function.
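The cleanup builds on the per-domain device list added earlier; roughly, the flush loops over raw device ids turn into a walk of domain->dev_list (a condensed sketch of the code introduced below, with the domain->lock handling elided):

        list_for_each_entry(dev_data, &domain->dev_list, list)
                iommu_flush_device(dev_data->dev);      /* one DTE flush per bound device */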
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 1 + arch/x86/kernel/amd_iommu.c | 100 +++++++++++---------------------- 2 files changed, 35 insertions(+), 66 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 93953d1922c4..f92d1b37b877 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -253,6 +253,7 @@ struct protection_domain { */ struct iommu_dev_data { struct list_head list; /* For domain->dev_list */ + struct device *dev; /* Device this data belong to */ struct device *alias; /* The Alias Device */ struct protection_domain *domain; /* Domain the device is bound to */ atomic_t bind; /* Domain attach reverent count */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 41c4ebecced4..0eafca58926f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -146,6 +146,8 @@ static int iommu_init_device(struct device *dev) if (!dev_data) return -ENOMEM; + dev_data->dev = dev; + devid = get_device_id(dev); alias = amd_iommu_alias_table[devid]; pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); @@ -478,31 +480,21 @@ static void iommu_flush_complete(struct protection_domain *domain) /* * Command send function for invalidating a device table entry */ -static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) -{ - struct iommu_cmd cmd; - int ret; - - BUG_ON(iommu == NULL); - - memset(&cmd, 0, sizeof(cmd)); - CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); - cmd.data[0] = devid; - - ret = iommu_queue_command(iommu, &cmd); - - return ret; -} - static int iommu_flush_device(struct device *dev) { struct amd_iommu *iommu; + struct iommu_cmd cmd; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - return iommu_queue_inv_dev_entry(iommu, devid); + /* Build command */ + memset(&cmd, 0, sizeof(cmd)); + CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); + cmd.data[0] = devid; + + return iommu_queue_command(iommu, &cmd); } static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, @@ -592,30 +584,43 @@ static void iommu_flush_tlb_pde(struct protection_domain *domain) __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } + /* - * This function flushes all domains that have devices on the given IOMMU + * This function flushes the DTEs for all devices in domain */ -static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +static void iommu_flush_domain_devices(struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); + + list_for_each_entry(dev_data, &domain->dev_list, list) + iommu_flush_device(dev_data->dev); + + spin_unlock_irqrestore(&domain->lock, flags); +} + +static void iommu_flush_all_domain_devices(void) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; struct protection_domain *domain; unsigned long flags; spin_lock_irqsave(&amd_iommu_pd_lock, flags); list_for_each_entry(domain, &amd_iommu_pd_list, list) { - if (domain->dev_iommu[iommu->index] == 0) - continue; - - spin_lock(&domain->lock); - iommu_queue_inv_iommu_pages(iommu, address, domain->id, 1, 1); + iommu_flush_domain_devices(domain); iommu_flush_complete(domain); - spin_unlock(&domain->lock); } spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } +void amd_iommu_flush_all_devices(void) +{ + iommu_flush_all_domain_devices(); +} + /* * This function uses heavy locking and may disable irqs for 
some time. But * this is no issue because it is only called during resume. */ @@ -637,38 +642,6 @@ void amd_iommu_flush_all_domains(void) spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -static void flush_all_devices_for_iommu(struct amd_iommu *iommu) -{ - int i; - - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (iommu != amd_iommu_rlookup_table[i]) - continue; - - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); - } -} - -static void flush_devices_by_domain(struct protection_domain *domain) -{ - struct amd_iommu *iommu; - int i; - - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || - (amd_iommu_pd_table[i] != domain)) - continue; - - iommu = amd_iommu_rlookup_table[i]; - if (!iommu) - continue; - - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); - } -} - static void reset_iommu_command_buffer(struct amd_iommu *iommu) { pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); @@ -679,17 +652,12 @@ static void reset_iommu_command_buffer(struct amd_iommu *iommu) iommu->reset_in_progress = true; amd_iommu_reset_cmd_buffer(iommu); - flush_all_devices_for_iommu(iommu); - flush_all_domains_on_iommu(iommu); + amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); iommu->reset_in_progress = false; } -void amd_iommu_flush_all_devices(void) -{ - flush_devices_by_domain(NULL); -} - /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -1692,7 +1660,7 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - flush_devices_by_domain(domain); + iommu_flush_domain_devices(domain); iommu_flush_tlb_pde(domain); domain->updated = false; -- cgit v1.2.2 From 8eed9833346781dd15e3bef35a91b0a40787ea3c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 15:45:41 +0100 Subject: x86/amd-iommu: Move reset_iommu_command_buffer out of locked code This patch removes the ugly construct where the iommu->lock must be released before calling the reset_iommu_command_buffer function.
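The shape of the fix, sketched with the surrounding driver code condensed (the flag and function names match the driver; the timed_out condition stands in for the EXIT_LOOP_COUNT check): failures are merely recorded while iommu->lock is held, and the reset runs once the lock has been dropped.

        spin_lock_irqsave(&iommu->lock, flags);
        /* ... queue command, wait for completion ... */
        if (unlikely(timed_out))
                iommu->reset_in_progress = true;        /* just remember the failure */
        spin_unlock_irqrestore(&iommu->lock, flags);

        if (iommu->reset_in_progress)
                reset_iommu_command_buffer(iommu);      /* safe: lock no longer held */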
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0eafca58926f..b75fcd9b6a0f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -285,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + iommu->reset_in_progress = true; reset_iommu_command_buffer(iommu); dump_command(address); break; @@ -407,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) { - spin_unlock(&iommu->lock); - reset_iommu_command_buffer(iommu); - spin_lock(&iommu->lock); - } + if (unlikely(i == EXIT_LOOP_COUNT)) + iommu->reset_in_progress = true; } /* @@ -458,6 +456,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu) out: spin_unlock_irqrestore(&iommu->lock, flags); + if (iommu->reset_in_progress) + reset_iommu_command_buffer(iommu); + return 0; } @@ -649,8 +650,6 @@ static void reset_iommu_command_buffer(struct amd_iommu *iommu) if (iommu->reset_in_progress) panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); - iommu->reset_in_progress = true; - amd_iommu_reset_cmd_buffer(iommu); amd_iommu_flush_all_devices(); amd_iommu_flush_all_domains(); -- cgit v1.2.2 From 492667dacc0ac9763969155482b1261b34ccf450 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 27 Nov 2009 13:25:47 +0100 Subject: x86/amd-iommu: Remove amd_iommu_pd_table The data that was stored in this table is now available in dev->archdata.iommu. So this table is no longer necessary. This patch removes the remaining uses of that variable and removes it from the code.
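In effect the domain lookup moves from a global table indexed by PCI device id to the per-device data, roughly:

        /* old: global table, one slot per device id */
        struct protection_domain *domain = amd_iommu_pd_table[devid];

        /* new: per-device data hanging off struct device */
        struct iommu_dev_data *dev_data = dev->archdata.iommu;
        struct protection_domain *domain = dev_data->domain;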
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 3 --- arch/x86/kernel/amd_iommu.c | 35 +++++++++++----------------------- arch/x86/kernel/amd_iommu_init.c | 18 ----------------- 3 files changed, 11 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index f92d1b37b877..ba19ad4c47d0 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -457,9 +457,6 @@ extern unsigned amd_iommu_aperture_order; /* largest PCI device id we expect translation requests for */ extern u16 amd_iommu_last_bdf; -/* data structures for protection domain handling */ -extern struct protection_domain **amd_iommu_pd_table; - /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b75fcd9b6a0f..32fb09102a13 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1309,8 +1309,6 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) { u64 pte_root = virt_to_phys(domain->pt_root); - BUG_ON(amd_iommu_pd_table[devid] != NULL); - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -1318,20 +1316,10 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); - - amd_iommu_pd_table[devid] = domain; - } static void clear_dte_entry(u16 devid) { - struct protection_domain *domain = amd_iommu_pd_table[devid]; - - BUG_ON(domain == NULL); - - /* remove domain from the lookup table */ - amd_iommu_pd_table[devid] = NULL; - /* remove entry from the device table seen by the hardware */ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; amd_iommu_dev_table[devid].data[1] = 0; @@ -1641,15 +1629,11 @@ static struct protection_domain *get_domain(struct device *dev) static void update_device_table(struct protection_domain *domain) { - unsigned long flags; - int i; + struct iommu_dev_data *dev_data; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] != domain) - continue; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - set_dte_entry(i, domain); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) { + u16 devid = get_device_id(dev_data->dev); + set_dte_entry(devid, domain); } } @@ -2259,14 +2243,17 @@ free_domains: static void cleanup_domain(struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *next; unsigned long flags; - u16 devid; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) - if (amd_iommu_pd_table[devid] == domain) - clear_dte_entry(devid); + list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { + struct device *dev = dev_data->dev; + + do_detach(dev); + atomic_set(&dev_data->bind, 0); + } write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fe1686f6f91b..7ffc39965233 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -164,12 +164,6 @@ u16 *amd_iommu_alias_table; */ struct amd_iommu **amd_iommu_rlookup_table; -/* - * The 
pd table (protection domain table) is used to find the protection domain - * data structure a device belongs to. Indexed with the PCI device id too. - */ -struct protection_domain **amd_iommu_pd_table; - /* * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap * to know which ones are already in use. @@ -1238,15 +1232,6 @@ static int __init amd_iommu_init(void) if (amd_iommu_rlookup_table == NULL) goto free; - /* - * Protection Domain table - maps devices to protection domains - * This table has the same size as the rlookup_table - */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(rlookup_table_size)); - if (amd_iommu_pd_table == NULL) - goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); @@ -1314,9 +1299,6 @@ free: free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); - free_pages((unsigned long)amd_iommu_rlookup_table, get_order(rlookup_table_size)); -- cgit v1.2.2 From 18ed61da985c57eea3fe8038b13fa2837c9b3c3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 27 Nov 2009 15:24:44 +0100 Subject: x86: hpet: Make WARN_ON understandable Andrew complained rightly that the WARN_ON in hpet_next_event() is confusing and the code comment not really helpful. Change it to WARN_ONCE and print the reason in clear text. Change the comment to explain what kind of hardware wreckage we deal with. Pointed-out-by: Andrew Morton Signed-off-by: Thomas Gleixner Cc: Venki Pallipadi --- arch/x86/kernel/hpet.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7f024ff47d1d..ba6e65884603 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -384,11 +384,22 @@ static int hpet_next_event(unsigned long delta, hpet_writel(cnt, HPET_Tn_CMP(timer)); /* - * We need to read back the CMP register to make sure that - * what we wrote hit the chip before we compare it to the - * counter. + * We need to read back the CMP register on certain HPET + * implementations (ATI chipsets) which seem to delay the + * transfer of the compare register into the internal compare + * logic. With small deltas this might actually be too late as + * the counter could already be higher than the compare value + * at that point and we would wait for the next hpet interrupt + * forever. We found out that reading the CMP register back + * forces the transfer so we can rely on the comparison with + * the counter register below. If the read back from the + * compare register does not match the value we programmed + * then we might have a real hardware problem. We can not do + * much about it here, but at least alert the user/admin with + * a prominent warning. */ - WARN_ON_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt); + WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, + KERN_WARNING "hpet: compare register read back failed.\n"); return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? 
-ETIME : 0; } -- cgit v1.2.2 From b8b7d791a8ff01d2380089279a69afa99115fb23 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 28 Nov 2009 15:03:03 +0100 Subject: x86: Use -maccumulate-outgoing-args for sane mcount prologues commit 746357d (x86: Prevent GCC 4.4.x (pentium-mmx et al) function prologue wreckage) uses -mtune=generic to work around the function prologue problem with mcount on -march=pentium-mmx and others. Jakub pointed out that we can use -maccumulate-outgoing-args instead which is selected by -mtune=generic and prevents the problem without losing the -march specific optimizations. Pointed-out-by: Jakub Jelinek Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: stable@kernel.org --- arch/x86/Makefile_32.cpu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index df7fdf811997..1937226fd502 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -49,8 +49,9 @@ cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) # Work around the pentium-mmx code generator madness of gcc4.4.x which # does stack alignment by generating horrible code _before_ the mcount # prologue (push %ebp, mov %esp, %ebp) which breaks the function graph -# tracer assumptions -cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-mtune=generic) +# tracer assumptions. For i686, generic, core2 this is set by the +# compiler anyway +cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args) # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. -- cgit v1.2.2 From 9eaa192d8988d621217a9e6071cd403fd6010496 Mon Sep 17 00:00:00 2001 From: "Helight.Xu" Date: Mon, 30 Nov 2009 18:33:51 +0800 Subject: x86: Fix a section mismatch in arch/x86/kernel/setup.c copy_edd() should be __init. warning msg: WARNING: vmlinux.o(.text+0x7759): Section mismatch in reference from the function copy_edd() to the variable .init.data:boot_params The function copy_edd() references the variable __initdata boot_params. This is often because copy_edd lacks a __initdata annotation or the annotation of boot_params is wrong. Signed-off-by: ZhenwenXu LKML-Reference: <4B139F8F.4000907@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e09f0e2c14b5..e020b2d03b4f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -247,7 +247,7 @@ EXPORT_SYMBOL(edd); * from boot_params into a safe place. * */ -static inline void copy_edd(void) +static inline void __init copy_edd(void) { memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, sizeof(edd.mbr_signature)); @@ -256,7 +256,7 @@ static inline void copy_edd(void) edd.edd_info_nr = boot_params.eddbuf_entries; } #else -static inline void copy_edd(void) +static inline void __init copy_edd(void) { } #endif -- cgit v1.2.2 From ccef086454d4c97e7b722e9303390207d681cb4c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 30 Nov 2009 21:33:51 -0800 Subject: x86, mm: Correct the implementation of is_untracked_pat_range() The semantics the PAT code expects of is_untracked_pat_range() is "is this range completely contained inside the untracked region." This means that checkin 8a27138924f64d2f30c1022f909f74480046bc3f was technically wrong, because the implementation was needlessly confusing.
The sane interface is for it to take a semiclosed range like just about everything else (as evidenced by the sheer number of "- 1"'s removed by that patch) so change the actual implementation to match. Reported-by: Suresh Siddha Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Jack Steiner Signed-off-by: H. Peter Anvin LKML-Reference: <20091119202341.GA4420@sgi.com> --- arch/x86/include/asm/e820.h | 6 +++++- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 68b4e0ec1950..761249e396fe 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -133,9 +133,13 @@ extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); +/* + * Returns true iff the specified range [s,e) is completely contained inside + * the ISA region. + */ static inline bool is_ISA_range(u64 s, u64 e) { - return s >= ISA_START_ADDRESS && e < ISA_END_ADDRESS; + return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS; } #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 597a47b1cec6..1e09417c992f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -39,7 +39,7 @@ static u64 gru_start_paddr, gru_end_paddr; static inline bool is_GRU_range(u64 start, u64 end) { - return start >= gru_start_paddr && end < gru_end_paddr; + return start >= gru_start_paddr && end <= gru_end_paddr; } static bool uv_is_untracked_pat_range(u64 start, u64 end) -- cgit v1.2.2 From 1cedae72904b85462082dbcfd5190309ba37f8bd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2009 07:32:16 +0100 Subject: hw-breakpoints: Keep track of user disabled breakpoints When we disable a breakpoint through dr7, we unregister it right away, making us lose track of its corresponding address register value. It means that the following sequence would be unsupported: - set address in dr0 - enable it through dr7 - disable it through dr7 - enable it through dr7 because we lost the address register value when we disabled the breakpoint. Don't unregister the disabled breakpoints but rather disable them. 
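A condensed sketch of the disable path this introduces (names and the modify_user_hw_breakpoint() signature as in the patch below; error handling trimmed): instead of unregistering, the breakpoint is modified in place with its disabled flag set, so the address register contents stay known to the kernel.

        struct perf_event_attr attr = bp->attr;        /* keeps bp_addr/bp_len/bp_type */
        attr.disabled = 1;                             /* switched off through dr7 */
        bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);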
Reported-by: "K.Prasad" Signed-off-by: Frederic Weisbecker LKML-Reference: <1259735536-9236-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2941b32ea666..04d182a7cfdb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -595,7 +595,7 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) static struct perf_event * ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk) + struct task_struct *tsk, int disabled) { int err; int gen_len, gen_type; @@ -616,7 +616,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, attr = bp->attr; attr.bp_len = gen_len; attr.bp_type = gen_type; - attr.disabled = 0; + attr.disabled = disabled; return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } @@ -655,13 +655,21 @@ restore: */ if (!second_pass) continue; + thread->ptrace_bps[i] = NULL; - unregister_hw_breakpoint(bp); + bp = ptrace_modify_breakpoint(bp, len, type, + tsk, 1); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; } continue; } - bp = ptrace_modify_breakpoint(bp, len, type, tsk); + bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { -- cgit v1.2.2 From ca64c47cecd0321b2e0dcbd7aaff44b68ce20654 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 1 Dec 2009 15:31:15 -0800 Subject: x86, io-apic: Move the effort of clearing remoteIRR explicitly before migrating the irq When the level-triggered interrupt is seen as an edge interrupt, we try to clear the remoteIRR explicitly (using either an io-apic eoi register when present or through the idea of changing trigger mode of the io-apic RTE to edge and then back to level). But this explicit try also needs to happen before we try to migrate the irq. Otherwise irq migration attempt will fail anyhow, as it postpones the irq migration to a later attempt when it sees the remoteIRR in the io-apic RTE still set. Signed-off-by: "Maciej W. Rozycki" Reviewed-by: Suresh Siddha Cc: ebiederm@xmission.com Cc: garyhade@us.ibm.com LKML-Reference: <20091201233334.975416130@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 085e60e303cf..b377b973899e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2583,6 +2583,20 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); + /* Tail end of version 0x11 I/O APIC bug workaround */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + if (use_eoi_reg) + eoi_ioapic_irq(desc); + else { + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); + spin_unlock(&ioapic_lock); + } + } + /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. 
@@ -2616,20 +2630,6 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } - - /* Tail end of version 0x11 I/O APIC bug workaround */ - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - - if (use_eoi_reg) - eoi_ioapic_irq(desc); - else { - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } - } } #ifdef CONFIG_INTR_REMAP -- cgit v1.2.2 From c29d9db338db606c3335a03f337e1d4b7f6bb727 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 1 Dec 2009 15:31:16 -0800 Subject: x86, ioapic: Fix the EOI register detection mechanism Maciej W. Rozycki reported: > 82093AA I/O APIC has its version set to 0x11 and it > does not support the EOI register. Similarly I/O APICs > integrated into the 82379AB south bridge and the 82374EB/SB > EISA component. IO-APIC versions below 0x20 don't support EOI register. Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic version as 0x2. This is an error with documentation and these ICH chips use io-apic's of version 0x20 and indeed has a working EOI register for the io-apic. Fix the EOI register detection mechanism to check for version 0x20 and beyond. And also, a platform can potentially have io-apic's with different versions. Make the EOI register check per io-apic. Reported-by: Maciej W. Rozycki Signed-off-by: Suresh Siddha Cc: ebiederm@xmission.com Cc: garyhade@us.ibm.com LKML-Reference: <20091201233335.065361533@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 115 ++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b377b973899e..78960a3b0ed0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -539,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, add_pin_to_irq_node(cfg, node, newapic, newpin); } +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + static void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { - int pin; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); - } + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) @@ -579,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) -{ - 
io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED, NULL); -} - -static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER, NULL); -} - static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { struct irq_cfg *cfg = desc->chip_data; @@ -2492,17 +2498,42 @@ static void ack_apic_edge(unsigned int irq) atomic_t irq_mis_count; -static int use_eoi_reg __read_mostly; - +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. +*/ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; for_each_irq_pin(entry, cfg->irq_2_pin) { - if (irq_remapped(irq)) - io_apic_eoi(entry->apic, entry->pin); - else - io_apic_eoi(entry->apic, cfg->vector); + if (mp_ioapics[entry->apic].apicver >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } else { + __mask_and_edge_IO_APIC_irq(entry); + __unmask_and_level_IO_APIC_irq(entry); + } } } @@ -2520,23 +2551,6 @@ static void eoi_ioapic_irq(struct irq_desc *desc) spin_unlock_irqrestore(&ioapic_lock, flags); } -static int ioapic_supports_eoi(void) -{ - struct pci_dev *root; - - root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0)); - if (root && root->vendor == PCI_VENDOR_ID_INTEL && - mp_ioapics[0].apicver >= 0x2) { - use_eoi_reg = 1; - printk(KERN_INFO "IO-APIC supports EOI register\n"); - } else - printk(KERN_INFO "IO-APIC doesn't support EOI\n"); - - return 0; -} - -fs_initcall(ioapic_supports_eoi); - static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2587,14 +2601,7 @@ static void ack_apic_level(unsigned int irq) if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - if (use_eoi_reg) - eoi_ioapic_irq(desc); - else { - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } + eoi_ioapic_irq(desc); } /* Now we can move and renable the irq */ -- cgit v1.2.2 From 1c83995b6c7c6bb795bce80f75fbffb15f78db2d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 1 Dec 2009 15:31:17 -0800 Subject: x86, ioapic: Document another case when level irq is seen as an edge In the case when cpu goes offline, fixup_irqs() will forward any unhandled interrupt on the offlined cpu to the new cpu destination that is handling the corresponding interrupt. This interrupt forwarding is done via IPI's. Hence, in this case also level-triggered io-apic interrupt will be seen as an edge interrupt in the cpu's APIC IRR. 
Document this scenario in the code which handles this case by doing an explicit EOI to the io-apic to clear remote IRR of the io-apic RTE. Requested-by: Maciej W. Rozycki Signed-off-by: Suresh Siddha Cc: Maciej W. Rozycki Cc: ebiederm@xmission.com Cc: garyhade@us.ibm.com LKML-Reference: <20091201233335.143970505@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 78960a3b0ed0..c0b4468683f9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2586,6 +2586,19 @@ static void ack_apic_level(unsigned int irq) * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. */ cfg = desc->chip_data; i = cfg->vector; @@ -2597,7 +2610,13 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); - /* Tail end of version 0x11 I/O APIC bug workaround */ + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); -- cgit v1.2.2 From 6d20792e85187b27ae3d1b76678a2dd7025e8bc2 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 1 Dec 2009 15:31:18 -0800 Subject: x86: Remove unnecessary mdelay() from cpu_disable_common() fixup_irqs() already has a mdelay(). Remove the extra and unnecessary mdelay() from cpu_disable_common(). Signed-off-by: Suresh Siddha Cc: Maciej W. Rozycki Cc: ebiederm@xmission.com Cc: garyhade@us.ibm.com LKML-Reference: <20091201233335.232177348@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 565ebc65920e..324f2a44c221 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1250,16 +1250,7 @@ static void __ref remove_cpu_from_maps(int cpu) void cpu_disable_common(void) { int cpu = smp_processor_id(); - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. 
- */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ -- cgit v1.2.2 From e859cf8656043f158b4004ccc8cbbf1ba4f97177 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 30 Nov 2009 19:02:22 -0500 Subject: x86: Fix comments of register/stack access functions Fix typos and some redundant comments of the register/stack access functions in asm/ptrace.h. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Frederic Weisbecker Cc: Roland McGrath Cc: Oleg Nesterov Cc: Wenji Huang Cc: Mahesh J Salgaonkar LKML-Reference: <20091201000222.7669.7477.stgit@harusame> Signed-off-by: Ingo Molnar Suggested-by: Wenji Huang --- arch/x86/include/asm/ptrace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index a3d49dd7d26e..3d11fd0f44c5 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -227,8 +227,8 @@ extern const char *regs_query_register_name(unsigned int offset); * @regs: pt_regs from which register value is gotten. * @offset: offset number of the register. * - * regs_get_register returns the value of a register whose offset from @regs - * is @offset. The @offset is the offset of the register in struct pt_regs. + * regs_get_register returns the value of a register. The @offset is the + * offset of the register in the struct pt_regs which is specified by @regs. * If @offset is bigger than MAX_REG_OFFSET, this returns 0. */ static inline unsigned long regs_get_register(struct pt_regs *regs, @@ -244,7 +244,7 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, * @regs: pt_regs which contains kernel stack pointer. * @addr: address which is checked. * - * regs_within_kenel_stack() checks @addr is within the kernel stack page(s). + * regs_within_kernel_stack() checks @addr is within the kernel stack page(s). * If @addr is within the kernel stack, it returns true. If not, returns false. */ static inline int regs_within_kernel_stack(struct pt_regs *regs, @@ -260,7 +260,7 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs, * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which - * is specifined by @regs. If the @n th entry is NOT in the kernel stack, + * is specified by @regs. If the @n th entry is NOT in the kernel stack, * this returns 0. */ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, -- cgit v1.2.2 From 01be50a308be466e122c3a8b3d535f1b673ecbd2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 27 Nov 2009 15:04:58 +0000 Subject: x86/alternatives: Check replacementlen <= instrlen at build time Having run into the run-(boot-)time check a couple of times lately, I finally took the time to find a build-time check, so that one doesn't need to analyze the register/stack dump and resolve it (through manual lookup in vmlinux) to the offending construct. The assembler will emit a message like "Error: value of too large for field of 1 bytes at ", which, while not pointing out the source location, still makes analysis quite a bit easier.
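The added .byte line works because the assembler only accepts values in the 0..255 range for a one-byte field: 0xff plus a positive (replacement - original) length difference overflows it, so the build fails exactly when the replacement is longer than the original. A minimal standalone illustration of the same idiom, with hypothetical two- and three-byte nop payloads standing in for real instructions:

        /*
         * Sketch only: assembles iff the "replacement" is no longer than
         * the "original"; here 0xff + 3 - 2 = 0x100 overflows the field.
         */
        asm(".pushsection .discard, \"a\"\n"
            "661: .byte 0x90, 0x90\n"                   /* original: 2 bytes */
            "662:\n"
            "663: .byte 0x90, 0x90, 0x90\n"             /* replacement: 3 bytes */
            "664:\n"
            ".byte 0xff + (664b-663b) - (662b-661b)\n"  /* build-time assertion */
            ".popsection");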
Signed-off-by: Jan Beulich LKML-Reference: <4B0FF8AA0200007800022703@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index c240efc74e00..69b74a7b877f 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -84,6 +84,7 @@ static inline void alternatives_smp_switch(int smp) {} " .byte " __stringify(feature) "\n" /* feature bit */ \ " .byte 662b-661b\n" /* sourcelen */ \ " .byte 664f-663f\n" /* replacementlen */ \ + " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ ".previous\n" \ ".section .altinstr_replacement, \"ax\"\n" \ "663:\n\t" newinstr "\n664:\n" /* replacement */ \ -- cgit v1.2.2 From 99063c0bcebcc913165a5d168050326eba3e0996 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 27 Nov 2009 15:06:16 +0000 Subject: x86/alternatives: No need for alternatives-asm.h to re-invent stuff already in asm.h This at once also gets the alignment specification right for x86-64. Signed-off-by: Jan Beulich LKML-Reference: <4B0FF8F80200007800022708@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative-asm.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e2077d343c33..b97f786a48d5 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -1,17 +1,13 @@ #ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_32 -# define X86_ALIGN .long -#else -# define X86_ALIGN .quad -#endif +#include <asm/asm.h> #ifdef CONFIG_SMP .macro LOCK_PREFIX 1: lock .section .smp_locks,"a" - .align 4 - X86_ALIGN 1b + _ASM_ALIGN + _ASM_PTR 1b .previous .endm #else -- cgit v1.2.2 From fe5ed91ddce85a0ed0e4f92c10b099873ef62167 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 3 Dec 2009 11:33:08 +0900 Subject: x86, mce: don't restart timer if disabled Even though it is in a rarely taken error path, the add_timer_on() at CPU_DOWN_FAILED* needs to be skipped if mce_timer is disabled. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Cc: Huang Ying Cc: Jan Beulich Cc: Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 721a77ca8115..4825a3d6eb40 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1991,9 +1991,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies(jiffies + + if (!mce_ignore_ce && check_interval) { + t->expires = round_jiffies(jiffies + __get_cpu_var(mce_next_interval)); - add_timer_on(t, cpu); + add_timer_on(t, cpu); + } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; case CPU_POST_DEAD: -- cgit v1.2.2 From 0934ac9d135021bec7f877340a039104af233bf3 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 23 Aug 2009 14:24:24 +0300 Subject: KVM: x86 emulator: Add 'push/pop sreg' instructions [avi: avoid buffer overflow] Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1be5cd640e93..1cdfec5231d0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -92,19 +92,22 @@ static u32 opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack, ImplicitOps | Stack, /* 0x08 - 0x0F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + 0, 0, ImplicitOps | Stack, 0, /* 0x10 - 0x17 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack, ImplicitOps | Stack, /* 0x18 - 0x1F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack, ImplicitOps | Stack, /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -244,11 +247,13 @@ static u32 twobyte_table[256] = { /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + ImplicitOps | Stack, ImplicitOps | Stack, + 0, DstMem | SrcReg | ModRM | BitOp, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + ImplicitOps | Stack, ImplicitOps | Stack, + 0, DstMem | SrcReg | ModRM | BitOp, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, @@ -1186,6 +1191,32 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, return rc; } +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment segment; + + kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); + + c->src.val = segment.selector; + emulate_push(ctxt); +} + +static int emulate_pop_sreg(struct 
x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + unsigned long selector; + int rc; + + rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + if (rc != 0) + return rc; + + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); + return rc; +} + static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1707,18 +1738,66 @@ special_insn: add: /* add */ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; + case 0x06: /* push es */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + goto cannot_emulate; + + emulate_push_sreg(ctxt, VCPU_SREG_ES); + break; + case 0x07: /* pop es */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + goto cannot_emulate; + + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); + if (rc != 0) + goto done; + break; case 0x08 ... 0x0d: or: /* or */ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; + case 0x0e: /* push cs */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + goto cannot_emulate; + + emulate_push_sreg(ctxt, VCPU_SREG_CS); + break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; + case 0x16: /* push ss */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + goto cannot_emulate; + + emulate_push_sreg(ctxt, VCPU_SREG_SS); + break; + case 0x17: /* pop ss */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + goto cannot_emulate; + + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); + if (rc != 0) + goto done; + break; case 0x18 ... 0x1d: sbb: /* sbb */ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; + case 0x1e: /* push ds */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + goto cannot_emulate; + + emulate_push_sreg(ctxt, VCPU_SREG_DS); + break; + case 0x1f: /* pop ds */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + goto cannot_emulate; + + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); + if (rc != 0) + goto done; + break; case 0x20 ... 0x25: and: /* and */ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); @@ -2297,6 +2376,14 @@ twobyte_insn: jmp_rel(c, c->src.val); c->dst.type = OP_NONE; break; + case 0xa0: /* push fs */ + emulate_push_sreg(ctxt, VCPU_SREG_FS); + break; + case 0xa1: /* pop fs */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); + if (rc != 0) + goto done; + break; case 0xa3: bt: /* bt */ c->dst.type = OP_NONE; @@ -2308,6 +2395,14 @@ twobyte_insn: case 0xa5: /* shld cl, r, r/m */ emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; + case 0xa8: /* push gs */ + emulate_push_sreg(ctxt, VCPU_SREG_GS); + break; + case 0xa9: /* pop gs */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); + if (rc != 0) + goto done; + break; case 0xab: bts: /* bts */ /* only subword offset */ -- cgit v1.2.2 From d8769fedd4e8323d8afea9a1b2bdebff4f1d2d37 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 23 Aug 2009 14:24:25 +0300 Subject: KVM: x86 emulator: Introduce No64 decode option Introduces a new decode option "No64", which is used for instructions that are invalid in long mode. 
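Mechanically, the new flag replaces many per-opcode mode checks with one table-driven test early in the decoder, as the diff below shows; the heart of it is just:

        #define No64 (1<<28)            /* instruction is invalid in long mode */

        /* in the decode path, once the opcode table entry is cached in c->d: */
        if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
                kvm_report_emulation_failure(ctxt->vcpu,
                                             "invalid x86/64 instruction");
                return -1;              /* reject at decode time */
        }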
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1cdfec5231d0..1f0ff4afa73e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -75,6 +75,8 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ +/* Misc flags */ +#define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) #define Src2CL (1<<29) @@ -93,21 +95,21 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack, ImplicitOps | Stack, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, ImplicitOps | Stack, 0, + 0, 0, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack, ImplicitOps | Stack, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack, ImplicitOps | Stack, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -161,7 +163,7 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, SrcImm | Src2Imm16, 0, + 0, 0, SrcImm | Src2Imm16 | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, @@ -188,7 +190,7 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ 0, 0, 0, ImplicitOps | Stack, - ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, + ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, @@ -201,7 +203,7 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte, SrcImmUByte, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, + SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ @@ -967,6 +969,11 @@ done_prefixes: } } + if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { + kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; + return -1; + } + if (c->d & Group) { group = c->d & GroupMask; c->modrm = insn_fetch(u8, 1, c->eip); @@ -1739,15 +1746,9 @@ special_insn: emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; case 0x06: /* push es */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - goto cannot_emulate; - 
emulate_push_sreg(ctxt, VCPU_SREG_ES); break; case 0x07: /* pop es */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - goto cannot_emulate; - rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); if (rc != 0) goto done; @@ -1757,9 +1758,6 @@ special_insn: emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; case 0x0e: /* push cs */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - goto cannot_emulate; - emulate_push_sreg(ctxt, VCPU_SREG_CS); break; case 0x10 ... 0x15: @@ -1767,15 +1765,9 @@ special_insn: emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; case 0x16: /* push ss */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - goto cannot_emulate; - emulate_push_sreg(ctxt, VCPU_SREG_SS); break; case 0x17: /* pop ss */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - goto cannot_emulate; - rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); if (rc != 0) goto done; @@ -1785,15 +1777,9 @@ special_insn: emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; case 0x1e: /* push ds */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - goto cannot_emulate; - emulate_push_sreg(ctxt, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - goto cannot_emulate; - rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); if (rc != 0) goto done; -- cgit v1.2.2 From 851ba6922ac575b749f63dee0ae072808163ba6a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 24 Aug 2009 11:10:17 +0300 Subject: KVM: Don't pass kvm_run arguments They're just copies of vcpu->run, which is readily accessible. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 ++-- arch/x86/kvm/emulate.c | 6 +-- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 102 +++++++++++++++++++----------------- arch/x86/kvm/vmx.c | 113 +++++++++++++++++++--------------------- arch/x86/kvm/x86.c | 50 +++++++++--------- 6 files changed, 141 insertions(+), 142 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d83892226f73..0b113f2b58cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -506,8 +506,8 @@ struct kvm_x86_ops { void (*tlb_flush)(struct kvm_vcpu *vcpu); - void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); - int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); + void (*run)(struct kvm_vcpu *vcpu); + int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); @@ -568,7 +568,7 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, +int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); @@ -585,9 +585,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); struct x86_emulate_ctxt; -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, int size, unsigned long count, int down, gva_t address, int rep, unsigned port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1f0ff4afa73e..0644d3df621a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1826,7 +1826,7 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + if (kvm_emulate_pio_string(ctxt->vcpu, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1842,7 +1842,7 @@ special_insn: return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + if (kvm_emulate_pio_string(ctxt->vcpu, 0, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -2135,7 +2135,7 @@ special_insn: case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, (c->d & ByteOp) ? 1 : c->op_bytes, port) != 0) { c->eip = saved_eip; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 818b92ad82cf..a9024797b21f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2789,7 +2789,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) if (r) goto out; - er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); + er = emulate_instruction(vcpu, cr2, error_code, 0); switch (er) { case EMULATE_DONE: diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c17404add91f..92048a626d4e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -286,7 +286,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!svm->next_rip) { - if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != + if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != EMULATE_DONE) printk(KERN_DEBUG "%s: NOP\n", __func__); return; @@ -1180,7 +1180,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, } } -static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int pf_interception(struct vcpu_svm *svm) { u64 fault_address; u32 error_code; @@ -1194,8 +1194,10 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } -static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int db_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->vcpu.arch.singlestep) { @@ -1223,25 +1225,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int bp_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = BP_VECTOR; return 0; } -static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int ud_interception(struct vcpu_svm *svm) { int er; - er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } -static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int nm_interception(struct vcpu_svm *svm) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) @@ 
-1251,7 +1255,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int mc_interception(struct vcpu_svm *svm) { /* * On an #MC intercept the MCE handler is not called automatically in @@ -1264,8 +1268,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int shutdown_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + /* * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. @@ -1277,7 +1283,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 0; } -static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int io_interception(struct vcpu_svm *svm) { u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ int size, in, string; @@ -1291,7 +1297,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (string) { if (emulate_instruction(&svm->vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + 0, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1301,33 +1307,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; skip_emulated_instruction(&svm->vcpu); - return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); + return kvm_emulate_pio(&svm->vcpu, in, size, port); } -static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int nmi_interception(struct vcpu_svm *svm) { return 1; } -static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int intr_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.irq_exits; return 1; } -static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int nop_on_interception(struct vcpu_svm *svm) { return 1; } -static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int halt_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu); } -static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmmcall_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); @@ -1837,7 +1843,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; } -static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -1857,7 +1863,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -1877,7 +1883,7 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmrun_interception(struct vcpu_svm *svm) { nsvm_printk("VMrun\n"); @@ -1907,7 +1913,7 @@ failed: return 1; } -static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int stgi_interception(struct vcpu_svm *svm) { if 
(nested_svm_check_permissions(svm)) return 1; @@ -1920,7 +1926,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int clgi_interception(struct vcpu_svm *svm) { if (nested_svm_check_permissions(svm)) return 1; @@ -1937,7 +1943,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; nsvm_printk("INVLPGA\n"); @@ -1950,15 +1956,13 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int invalid_op_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int invalid_op_interception(struct vcpu_svm *svm) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } -static int task_switch_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; int reason; @@ -2008,14 +2012,14 @@ static int task_switch_interception(struct vcpu_svm *svm, return kvm_task_switch(&svm->vcpu, tss_selector, reason); } -static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int cpuid_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; kvm_emulate_cpuid(&svm->vcpu); return 1; } -static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); @@ -2023,26 +2027,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int invlpg_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); return 1; } -static int emulate_on_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int emulate_on_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); return 1; } -static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int cr8_write_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ - emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); + emulate_instruction(&svm->vcpu, 0, 0, 0); if (irqchip_in_kernel(svm->vcpu.kvm)) { svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; return 1; @@ -2128,7 +2133,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) return 0; } -static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int rdmsr_interception(struct vcpu_svm *svm) { u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; @@ -2221,7 +2226,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) return 0; } -static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int wrmsr_interception(struct vcpu_svm *svm) { u32 ecx = 
svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) @@ -2237,17 +2242,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int msr_interception(struct vcpu_svm *svm) { if (svm->vmcb->control.exit_info_1) - return wrmsr_interception(svm, kvm_run); + return wrmsr_interception(svm); else - return rdmsr_interception(svm, kvm_run); + return rdmsr_interception(svm); } -static int interrupt_window_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int interrupt_window_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -2265,8 +2271,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm, return 1; } -static int (*svm_exit_handlers[])(struct vcpu_svm *svm, - struct kvm_run *kvm_run) = { +static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = emulate_on_interception, [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, @@ -2321,9 +2326,10 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_NPF] = pf_interception, }; -static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; trace_kvm_exit(exit_code, svm->vmcb->save.rip); @@ -2383,7 +2389,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } - return svm_exit_handlers[exit_code](svm, kvm_run); + return svm_exit_handlers[exit_code](svm); } static void reload_tss(struct kvm_vcpu *vcpu) @@ -2588,7 +2594,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) #define R "e" #endif -static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u16 fs_selector; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ed53b42caba1..4635298d000a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,7 +2659,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) return 1; /* * Forward all other exceptions that are valid in real mode. 
@@ -2710,15 +2710,16 @@ static void kvm_machine_check(void) #endif } -static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_machine_check(struct kvm_vcpu *vcpu) { /* already handled by vcpu_run */ return 1; } -static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; unsigned long cr2, rip, dr6; u32 vect_info; @@ -2728,7 +2729,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (is_machine_check(intr_info)) - return handle_machine_check(vcpu, kvm_run); + return handle_machine_check(vcpu); if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) @@ -2744,7 +2745,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -2803,20 +2804,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } -static int handle_external_interrupt(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; } -static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_triple_fault(struct kvm_vcpu *vcpu) { - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } -static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int size, in, string; @@ -2827,8 +2827,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) string = (exit_qualification & 16) != 0; if (string) { - if (emulate_instruction(vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -2838,7 +2837,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) port = exit_qualification >> 16; skip_emulated_instruction(vcpu); - return kvm_emulate_pio(vcpu, kvm_run, in, size, port); + return kvm_emulate_pio(vcpu, in, size, port); } static void @@ -2852,7 +2851,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; @@ -2887,7 +2886,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; if (cr8_prev <= cr8) return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } }; @@ -2922,13 +2921,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: break; } - kvm_run->exit_reason = 0; + vcpu->run->exit_reason = 0; pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } -static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; unsigned long val; @@ -2944,13 +2943,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * 
guest debugging itself. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - kvm_run->debug.arch.dr6 = vcpu->arch.dr6; - kvm_run->debug.arch.dr7 = dr; - kvm_run->debug.arch.pc = + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr7 = dr; + vcpu->run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + vmcs_readl(GUEST_RIP); - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { vcpu->arch.dr7 &= ~DR7_GD; @@ -3016,13 +3015,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_cpuid(struct kvm_vcpu *vcpu) { kvm_emulate_cpuid(vcpu); return 1; } -static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data; @@ -3041,7 +3040,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) @@ -3058,14 +3057,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { return 1; } -static int handle_interrupt_window(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_interrupt_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -3081,34 +3078,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * possible */ if (!irqchip_in_kernel(vcpu->kvm) && - kvm_run->request_interrupt_window && + vcpu->run->request_interrupt_window && !kvm_cpu_has_interrupt(vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } return 1; } -static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_halt(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); return kvm_emulate_halt(vcpu); } -static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_vmcall(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); kvm_emulate_hypercall(vcpu); return 1; } -static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_vmx_insn(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -3117,14 +3114,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); /* TODO: Add support for VT-d/pass-through device */ return 1; } -static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_apic_access(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; enum emulation_result er; @@ -3133,7 +3130,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = 
vmcs_readl(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + er = emulate_instruction(vcpu, 0, 0, 0); if (er != EMULATE_DONE) { printk(KERN_ERR @@ -3144,7 +3141,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; @@ -3198,7 +3195,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; @@ -3219,8 +3216,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vmcs_readl(GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; return 0; } @@ -3290,7 +3287,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, } } -static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { u64 sptes[4]; int nr_sptes, i; @@ -3306,13 +3303,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; return 0; } -static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -3325,8 +3322,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static void handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; @@ -3335,7 +3331,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, preempt_enable(); while (!guest_state_valid(vcpu)) { - err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + err = emulate_instruction(vcpu, 0, 0, 0); if (err == EMULATE_DO_MMIO) break; @@ -3362,8 +3358,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) = { +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, @@ -3403,7 +3398,7 @@ static const int kvm_vmx_max_exit_handlers = * The guest has exited. See if we can fix it or if we need userspace * assistance. 
*/ -static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; @@ -3425,8 +3420,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); if (unlikely(vmx->fail)) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); return 0; } @@ -3459,10 +3454,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); + return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_reason; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = exit_reason; } return 0; } @@ -3600,7 +3595,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) #define Q "l" #endif -static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3614,7 +3609,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Handle invalid guest state instead of entering VMX */ if (vmx->emulation_required && emulate_invalid_guest_state) { - handle_invalid_guest_state(vcpu, kvm_run); + handle_invalid_guest_state(vcpu); return; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae07d261527c..1687d12b122a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2757,13 +2757,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) } int emulate_instruction(struct kvm_vcpu *vcpu, - struct kvm_run *run, unsigned long cr2, u16 error_code, int emulation_type) { int r, shadow_mask; struct decode_cache *c; + struct kvm_run *run = vcpu->run; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -2969,8 +2969,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu) return r; } -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, - int size, unsigned port) +int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { unsigned long val; @@ -2999,7 +2998,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, int size, unsigned long count, int down, gva_t address, int rep, unsigned port) { @@ -3453,17 +3452,17 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); * * No need to exit to userspace if we already have an interrupt queued. 
*/ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - kvm_run->request_interrupt_window && + vcpu->run->request_interrupt_window && kvm_arch_interrupt_allowed(vcpu)); } -static void post_kvm_run_save(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static void post_kvm_run_save(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); @@ -3525,7 +3524,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void inject_pending_event(struct kvm_vcpu *vcpu) { /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { @@ -3561,11 +3560,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } -static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && - kvm_run->request_interrupt_window; + vcpu->run->request_interrupt_window; if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) @@ -3586,12 +3585,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { - kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; + vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } @@ -3615,7 +3614,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - inject_pending_event(vcpu, kvm_run); + inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ if (vcpu->arch.nmi_pending) @@ -3641,7 +3640,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } trace_kvm_entry(vcpu->vcpu_id); - kvm_x86_ops->run(vcpu, kvm_run); + kvm_x86_ops->run(vcpu); if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { set_debugreg(current->thread.debugreg0, 0); @@ -3682,13 +3681,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_lapic_sync_from_vapic(vcpu); - r = kvm_x86_ops->handle_exit(kvm_run, vcpu); + r = kvm_x86_ops->handle_exit(vcpu); out: return r; } -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; @@ -3708,7 +3707,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = 1; while (r > 0) { if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - r = vcpu_enter_guest(vcpu, kvm_run); + r = vcpu_enter_guest(vcpu); else { up_read(&vcpu->kvm->slots_lock); kvm_vcpu_block(vcpu); @@ -3736,14 +3735,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); - if (dm_request_for_irq_injection(vcpu, kvm_run)) { + if (dm_request_for_irq_injection(vcpu)) { r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->run->exit_reason = 
KVM_EXIT_INTR; ++vcpu->stat.request_irq_exits; } if (signal_pending(current)) { r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } if (need_resched()) { @@ -3754,7 +3753,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } up_read(&vcpu->kvm->slots_lock); - post_kvm_run_save(vcpu, kvm_run); + post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -3794,8 +3793,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_needed = 0; down_read(&vcpu->kvm->slots_lock); - r = emulate_instruction(vcpu, kvm_run, - vcpu->arch.mmio_fault_cr2, 0, + r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, EMULTYPE_NO_DECODE); up_read(&vcpu->kvm->slots_lock); if (r == EMULATE_DO_MMIO) { @@ -3811,7 +3809,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); - r = __vcpu_run(vcpu, kvm_run); + r = __vcpu_run(vcpu); out: if (vcpu->sigset_active) -- cgit v1.2.2 From 79c727d4371aa9af47b0cdbcad53742b5a7919ea Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:18 +0300 Subject: KVM: Call pic_clear_isr() on pic reset to reuse logic there Also move the call of the ack notifiers to after the pic state change. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 01f151682802..ccc941af4eaf 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -225,22 +225,11 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, irqbase, n; + int irq; struct kvm *kvm = s->pics_state->irq_request_opaque; struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; + u8 irr = s->irr, isr = s->isr; - if (s == &s->pics_state->pics[0]) - irqbase = 0; - else - irqbase = 8; - - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { - if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) - if (s->irr & (1 << irq) || s->isr & (1 << irq)) { - n = irq + irqbase; - kvm_notify_acked_irq(kvm, SELECT_PIC(n), n); - } - } s->last_irr = 0; s->irr = 0; s->imr = 0; @@ -256,6 +245,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->rotate_on_auto_eoi = 0; s->special_fully_nested_mode = 0; s->init4 = 0; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { + if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) + if (irr & (1 << irq) || isr & (1 << irq)) { + pic_clear_isr(s, irq); + } + } } static void pic_ioport_write(void *opaque, u32 addr, u32 val) -- cgit v1.2.2 From 1a6e4a8c276e122dbeb6f9c610f29735e4236bfd Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:19 +0300 Subject: KVM: Move irq sharing information to irqchip level This removes the assumption that the maximum number of GSIs is smaller than the number of pins. Sharing is tracked at pin level, not GSI level.
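The bookkeeping itself is simple: each pin carries one word of per-source bits, and the line counts as asserted while any source bit is set. A self-contained sketch of that scheme (illustrative names only, not the kvm code):

        #include <linux/bitops.h>

        struct demo_irqchip {
                unsigned long irq_states[16];   /* one word of source bits per pin */
        };

        /* returns the effective line level after the update */
        static int demo_set_irq(struct demo_irqchip *c, int pin, int src_id, int level)
        {
                if (level)
                        __set_bit(src_id, &c->irq_states[pin]);
                else
                        __clear_bit(src_id, &c->irq_states[pin]);
                return !!c->irq_states[pin];
        }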
[avi: no PIC on ia64] Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/irq.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b113f2b58cf..35d3236c9de4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -410,7 +410,6 @@ struct kvm_arch{ gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; - unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; u64 vm_init_tsc; }; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7d6058a2fd38..c025a2362aae 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -71,6 +71,7 @@ struct kvm_pic { int output; /* intr from master PIC */ struct kvm_io_device dev; void (*ack_notifier)(void *opaque, int irq); + unsigned long irq_states[16]; }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -- cgit v1.2.2 From 3e71f88bc90792a187703860cf22fbed7c12cbd9 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:21 +0300 Subject: KVM: Maintain back mapping from irqchip/pin to gsi Maintain a back mapping from irqchip/pin to gsi to speed up interrupt acknowledgment notifications. [avi: build fix on non-x86/ia64] Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 4a5fe914dc59..f02e87a5206f 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -79,6 +79,7 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_PIC_MASTER 0 #define KVM_IRQCHIP_PIC_SLAVE 1 #define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { -- cgit v1.2.2 From 136bdfeee7b5bc986fc94af3a40d7d13ea37bb95 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:23 +0300 Subject: KVM: Move irq ack notifier list to arch independent code The mask irq notifier list is already there. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 35d3236c9de4..a46e2dd9aca8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -397,7 +397,6 @@ struct kvm_arch{ struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; - struct hlist_head irq_ack_notifier_list; int vapics_in_nmi_mode; unsigned int tss_addr; -- cgit v1.2.2 From eba0226bdfffe262e72b8360e4d0d12070e9a0f0 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:25 +0300 Subject: KVM: Move IO APIC to its own lock This allows removal of irq_lock from the injection path. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 22 ++++++++++++++++------ arch/x86/kvm/lapic.c | 5 +---- arch/x86/kvm/x86.c | 10 ++--------- 3 files changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index ccc941af4eaf..d057c0cbd245 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) s->isr_ack |= (1 << irq); if (s != &s->pics_state->pics[0]) irq += 8; + /* + * We are dropping lock while calling ack notifiers since ack + * notifier callbacks for assigned devices call into PIC recursively.
+ * Other interrupt may be delivered to PIC while lock is dropped but + * it should be safe since PIC state is already updated at this stage. + */ + spin_unlock(&s->pics_state->lock); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); + spin_lock(&s->pics_state->lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) @@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) static inline void pic_intack(struct kvm_kpic_state *s, int irq) { s->isr |= 1 << irq; - if (s->auto_eoi) { - if (s->rotate_on_auto_eoi) - s->priority_add = (irq + 1) & 7; - pic_clear_isr(s, irq); - } /* * We don't clear a level sensitive interrupt here */ if (!(s->elcr & (1 << irq))) s->irr &= ~(1 << irq); + + if (s->auto_eoi) { + if (s->rotate_on_auto_eoi) + s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); + } + } int kvm_pic_read_irq(struct kvm *kvm) @@ -294,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) priority = get_priority(s, s->isr); if (priority != 8) { irq = (priority + s->priority_add) & 7; - pic_clear_isr(s, irq); if (cmd == 5) s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); } break; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 23c217692ea9..df8bcb0f66d8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -471,11 +471,8 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { - mutex_lock(&apic->vcpu->kvm->irq_lock); + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - mutex_unlock(&apic->vcpu->kvm->irq_lock); - } } static void apic_send_ipi(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1687d12b122a..fdf989f17a61 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2038,9 +2038,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - memcpy(&chip->chip.ioapic, - ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); + r = kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -2070,11 +2068,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: - mutex_lock(&kvm->irq_lock); - memcpy(ioapic_irqchip(kvm), - &chip->chip.ioapic, - sizeof(struct kvm_ioapic_state)); - mutex_unlock(&kvm->irq_lock); + r = kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; -- cgit v1.2.2 From 680b3648ba89c44ac8d0316f78a0d6e147b88809 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Aug 2009 11:54:26 +0300 Subject: KVM: Drop kvm->irq_lock lock from irq injection path The only thing it protects now is interrupt injection into lapic and this can work lockless. Even now with kvm->irq_lock in place access to lapic is not entirely serialized since vcpu access doesn't take kvm->irq_lock. 
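For callers the conversion is mechanical: the lock/unlock pair around the injection call simply disappears, as this before/after view of the i8254 hunk below shows:

        /* before: injection serialized by the VM-wide irq_lock */
        mutex_lock(&kvm->irq_lock);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
        mutex_unlock(&kvm->irq_lock);

        /* after: lockless, per the reasoning above */
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);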
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 -- arch/x86/kvm/lapic.c | 2 -- arch/x86/kvm/x86.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 144e7f60b5e2..fab7440c9bb2 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -688,10 +688,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm) struct kvm_vcpu *vcpu; int i; - mutex_lock(&kvm->irq_lock); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - mutex_unlock(&kvm->irq_lock); /* * Provides NMI watchdog support via Virtual Wire mode. diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index df8bcb0f66d8..8787637b1a9c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -501,9 +501,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, irq.vector); - mutex_lock(&apic->vcpu->kvm->irq_lock); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); - mutex_unlock(&apic->vcpu->kvm->irq_lock); } static u32 apic_get_tmcct(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fdf989f17a61..5beb4c16caab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2286,10 +2286,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { __s32 status; - mutex_lock(&kvm->irq_lock); status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); - mutex_unlock(&kvm->irq_lock); if (ioctl == KVM_IRQ_LINE_STATUS) { irq_event.status = status; if (copy_to_user(argp, &irq_event, -- cgit v1.2.2 From 367e1319b229110a27c53221c2fa32a6aa86d4a9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 26 Aug 2009 14:57:07 +0300 Subject: KVM: Return -ENOTTY on unrecognized ioctls Not the incorrect -EINVAL. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5beb4c16caab..829e3063e2ab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2176,7 +2176,7 @@ long kvm_arch_vm_ioctl(struct file *filp, { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; - int r = -EINVAL; + int r = -ENOTTY; /* * This union makes it completely explicit to gcc-3.x * that these two variables' stack usage should be -- cgit v1.2.2 From bfd99ff5d483b11c32bca49fbff7a5ac59038b0a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 26 Aug 2009 14:57:50 +0300 Subject: KVM: Move assigned device code to own file Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 0e7fe78d0f74..31a7035c4bd9 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -6,7 +6,8 @@ CFLAGS_svm.o := -I. CFLAGS_vmx.o := -I. kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o eventfd.o) + coalesced_mmio.o irq_comm.o eventfd.o \ + assigned-dev.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ -- cgit v1.2.2 From 94677e61fdcf4cdae11a1b7c8974d7034ef9bd1b Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Fri, 28 Aug 2009 16:41:44 +0200 Subject: KVM: x86 emulator: Add missing decoder flags for 'or' instructions Add missing decoder flags for or instructions (0xc-0xd). 
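[ed: for reference, these are the accumulator/immediate forms of OR in the one-byte opcode map, which explains the flags chosen in the hunk below:]

	/*
	 * 0x0C  OR AL, imm8       ->  ByteOp | DstAcc | SrcImm
	 * 0x0D  OR eAX, imm16/32  ->           DstAcc | SrcImm
	 *
	 * DstAcc selects the accumulator as destination, SrcImm an
	 * immediate operand fetched from the instruction stream.
	 */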
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0644d3df621a..db0820dfbffe 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -99,7 +99,8 @@ static u32 opcode_table[256] = { /* 0x08 - 0x0F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, ImplicitOps | Stack | No64, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit v1.2.2 From abcf14b560a4ba62c659e6f5aafc8f9934d8c130 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Tue, 1 Sep 2009 15:28:11 +0200 Subject: KVM: x86 emulator: Add pusha and popa instructions This adds pusha and popa instructions (opcodes 0x60-0x61); this enables booting MINIX with invalid guest state emulation on. [marcelo: remove unused variable] Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index db0820dfbffe..d226dff47d77 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -139,7 +139,8 @@ static u32 opcode_table[256] = { DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, /* 0x60 - 0x67 */ - 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 0, 0, 0, 0, /* 0x68 - 0x6F */ SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, @@ -1225,6 +1226,43 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } +static void emulate_pusha(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + int reg = VCPU_REGS_RAX; + + while (reg <= VCPU_REGS_RDI) { + (reg == VCPU_REGS_RSP) ?
+ (c->src.val = old_esp) : (c->src.val = c->regs[reg]); + + emulate_push(ctxt); + ++reg; + } +} + +static int emulate_popa(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + int reg = VCPU_REGS_RDI; + + while (reg >= VCPU_REGS_RAX) { + if (reg == VCPU_REGS_RSP) { + register_address_increment(c, &c->regs[VCPU_REGS_RSP], + c->op_bytes); + --reg; + } + + rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); + if (rc != 0) + break; + --reg; + } + return rc; +} + static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1816,6 +1854,14 @@ special_insn: if (rc != 0) goto done; break; + case 0x60: /* pusha */ + emulate_pusha(ctxt); + break; + case 0x61: /* popa */ + rc = emulate_popa(ctxt, ops); + if (rc != 0) + goto done; + break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; -- cgit v1.2.2 From 80ced186d1761d2a66163d9eeb468ddb1f7e0697 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Tue, 1 Sep 2009 12:48:18 +0200 Subject: KVM: VMX: Enhance invalid guest state emulation - Change returned handle_invalid_guest_state() to return relevant exit codes - Move triggering the emulation from vmx_vcpu_run() to vmx_handle_exit() - Return to userspace instead of repeatedly trying to emulate instructions that have already failed Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4635298d000a..73cb5dd960cf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -107,7 +107,6 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; - enum emulation_result invalid_state_emulation_result; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; @@ -3322,35 +3321,37 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } -static void handle_invalid_guest_state(struct kvm_vcpu *vcpu) +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; - - local_irq_enable(); - preempt_enable(); + int ret = 1; while (!guest_state_valid(vcpu)) { err = emulate_instruction(vcpu, 0, 0, 0); - if (err == EMULATE_DO_MMIO) - break; + if (err == EMULATE_DO_MMIO) { + ret = 0; + goto out; + } if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, "emulation failure"); - break; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + ret = 0; + goto out; } if (signal_pending(current)) - break; + goto out; if (need_resched()) schedule(); } - preempt_disable(); - local_irq_disable(); - - vmx->invalid_state_emulation_result = err; + vmx->emulation_required = 0; +out: + return ret; } /* @@ -3406,13 +3407,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); - /* If we need to emulate an MMIO from handle_invalid_guest_state - * we just return 0 */ - if (vmx->emulation_required && emulate_invalid_guest_state) { - if (guest_state_valid(vcpu)) - vmx->emulation_required = 0; - return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO; - } + /* If guest state is invalid, start emulating */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return handle_invalid_guest_state(vcpu); /* Access CR3 don't cause VMExit in paging mode, so we need * to 
sync with guest real CR3. */ @@ -3607,11 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); - /* Handle invalid guest state instead of entering VMX */ - if (vmx->emulation_required && emulate_invalid_guest_state) { - handle_invalid_guest_state(vcpu); + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required && emulate_invalid_guest_state) return; - } if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); -- cgit v1.2.2 From e8b3433a5c062e94e34cadb6144c10689a497bc3 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 8 Sep 2009 14:47:38 -0300 Subject: KVM: SVM: remove needless mmap_sem acquisition from nested_svm_map nested_svm_map unnecessarily takes mmap_sem around gfn_to_page, since gfn_to_page / get_user_pages are responsible for it. Signed-off-by: Marcelo Tosatti Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 92048a626d4e..f54c4f9d2865 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1396,10 +1396,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) { struct page *page; - down_read(&current->mm->mmap_sem); page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); - up_read(&current->mm->mmap_sem); - if (is_error_page(page)) goto error; -- cgit v1.2.2 From 10474ae8945ce08622fd1f3464e55bd817bf2376 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 15 Sep 2009 11:37:46 +0200 Subject: KVM: Activate Virtualization On Demand X86 CPUs need to have some magic happening to enable the virtualization extensions on them. This magic can result in unpleasant results for users, like blocking other VMMs from working (vmx) or using invalid TLB entries (svm). Currently KVM activates virtualization when the respective kernel module is loaded. This blocks us from autoloading KVM modules without breaking other VMMs. To circumvent this problem at least a bit, this patch introduces on demand activation of virtualization. This means that virtualization is instead enabled on creation of the first virtual machine and disabled on destruction of the last one. So using this, KVM can be easily autoloaded, while keeping other hypervisors usable.
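[ed: the generic half of this patch lives in virt/kvm/kvm_main.c and is not part of this arch/x86 extract; as a rough sketch of the idea, with the cross-CPU helper names left hypothetical:]

	/* Sketch only: refcount live VMs, enable on first, disable on last. */
	static int kvm_usage_count;

	static int hardware_enable_all(void)		/* VM creation */
	{
		int r = 0;

		spin_lock(&kvm_lock);
		if (++kvm_usage_count == 1)
			r = enable_virt_on_all_cpus();	/* hypothetical helper */
		if (r)
			--kvm_usage_count;
		spin_unlock(&kvm_lock);
		return r;
	}

	static void hardware_disable_all(void)		/* VM destruction */
	{
		spin_lock(&kvm_lock);
		if (--kvm_usage_count == 0)
			disable_virt_on_all_cpus();	/* hypothetical helper */
		spin_unlock(&kvm_lock);
	}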
Signed-off-by: Alexander Graf Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 13 +++++++++---- arch/x86/kvm/vmx.c | 11 ++++++++--- arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 20 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a46e2dd9aca8..295c7c4d9c90 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -459,7 +459,7 @@ struct descriptor_table { struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ - void (*hardware_enable)(void *dummy); /* __init */ + int (*hardware_enable)(void *dummy); void (*hardware_disable)(void *dummy); void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f54c4f9d2865..59fe4d54da11 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -316,7 +316,7 @@ static void svm_hardware_disable(void *garbage) cpu_svm_disable(); } -static void svm_hardware_enable(void *garbage) +static int svm_hardware_enable(void *garbage) { struct svm_cpu_data *svm_data; @@ -325,16 +325,20 @@ static void svm_hardware_enable(void *garbage) struct desc_struct *gdt; int me = raw_smp_processor_id(); + rdmsrl(MSR_EFER, efer); + if (efer & EFER_SVME) + return -EBUSY; + if (!has_svm()) { printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); - return; + return -EINVAL; } svm_data = per_cpu(svm_data, me); if (!svm_data) { printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", me); - return; + return -EINVAL; } svm_data->asid_generation = 1; @@ -345,11 +349,12 @@ static void svm_hardware_enable(void *garbage) gdt = (struct desc_struct *)gdt_descr.base; svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); - rdmsrl(MSR_EFER, efer); wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(svm_data->save_area) << PAGE_SHIFT); + + return 0; } static void svm_cpu_uninit(int cpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 73cb5dd960cf..a187570e4837 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1138,12 +1138,15 @@ static __init int vmx_disabled_by_bios(void) /* locked but not enabled */ } -static void hardware_enable(void *garbage) +static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old; + if (read_cr4() & X86_CR4_VMXE) + return -EBUSY; + INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); if ((old & (FEATURE_CONTROL_LOCKED | @@ -1158,6 +1161,10 @@ static void hardware_enable(void *garbage) asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); + + ept_sync_global(); + + return 0; } static void vmclear_local_vcpus(void) @@ -4040,8 +4047,6 @@ static int __init vmx_init(void) if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); - ept_sync_global(); - return 0; out3: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 829e3063e2ab..3d83de8bcbf4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4691,9 +4691,9 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) return kvm_x86_ops->vcpu_reset(vcpu); } -void kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void *garbage) { - kvm_x86_ops->hardware_enable(garbage); + return kvm_x86_ops->hardware_enable(garbage); } void kvm_arch_hardware_disable(void 
*garbage) -- cgit v1.2.2 From bfc33beaed3ecf0f92612dc7fb7095029ae7722e Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Wed, 16 Sep 2009 21:09:39 +0800 Subject: KVM: remove duplicated #include Remove duplicated #include('s) in arch/x86/kvm/lapic.c Signed-off-by: Huang Weiyi Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8787637b1a9c..cd60c0bd1b32 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" -- cgit v1.2.2 From 7fcdb5103d10d2eb75876637a2efa9679cce14d3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 16 Sep 2009 15:24:15 +0200 Subject: KVM: SVM: reorganize svm_interrupt_allowed This patch reorganizes the logic in svm_interrupt_allowed to make it better to read. This is important because the logic is a lot more complicated with Nested SVM. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 59fe4d54da11..3f3fe815c21b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2472,10 +2472,18 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - return (vmcb->save.rflags & X86_EFLAGS_IF) && - !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - gif_set(svm) && - !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); + int ret; + + if (!gif_set(svm) || + (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) + return 0; + + ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); + + if (is_nested(svm)) + return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); + + return ret; } static void enable_irq_window(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 33527ad7e12a5cb50b39165945464600ab2f7632 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 16 Sep 2009 15:24:16 +0200 Subject: KVM: SVM: don't copy exit_int_info on nested vmrun The exit_int_info field is only written by the hardware and never read. So it does not need to be copied on a vmrun emulation. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3f3fe815c21b..41c996ab87e9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1797,8 +1797,6 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->nested.intercept = nested_vmcb->control.intercept; force_new_asid(&svm->vcpu); - svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; - svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err; svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; if (nested_vmcb->control.int_ctl & V_IRQ_MASK) { nsvm_printk("nSVM Injecting Interrupt: 0x%x\n", -- cgit v1.2.2 From e935d48e1b49451490218e1181d9834176200955 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 16 Sep 2009 15:24:19 +0200 Subject: KVM: SVM: Remove remaining occurences of rdtscll This patch replaces them with native_read_tsc() which can also be used in expressions and saves a variable on the stack in this case. 
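[ed: the difference in a nutshell; rdtscll() is a statement-style macro that assigns to its argument, while native_read_tsc() is a function returning u64, as the hunks below show:]

	u64 tsc_this, delta;

	rdtscll(tsc_this);			/* macro writes into tsc_this */
	delta = vcpu->arch.host_tsc - tsc_this;

	/* same result, usable directly in an expression, no temporary: */
	delta = vcpu->arch.host_tsc - native_read_tsc();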
Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41c996ab87e9..9a4dacab6d8a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -763,14 +763,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int i; if (unlikely(cpu != vcpu->cpu)) { - u64 tsc_this, delta; + u64 delta; /* * Make sure that the guest sees a monotonically * increasing TSC. */ - rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; + delta = vcpu->arch.host_tsc - native_read_tsc(); svm->vmcb->control.tsc_offset += delta; if (is_nested(svm)) svm->nested.hsave->control.tsc_offset += delta; @@ -792,7 +791,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - rdtscll(vcpu->arch.host_tsc); + vcpu->arch.host_tsc = native_read_tsc(); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From b820cc0ca20fdcf8014d8e57421cf29095e39392 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 29 Sep 2009 11:38:34 -1000 Subject: KVM: Separate timer initialization into an independent function Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d83de8bcbf4..6a31dfb8849c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3118,9 +3118,22 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = { .notifier_call = kvmclock_cpufreq_notifier }; +static void kvm_timer_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + tsc_khz_ref = tsc_khz; + cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } +} + int kvm_arch_init(void *opaque) { - int r, cpu; + int r; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; if (kvm_x86_ops) { @@ -3152,13 +3165,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - tsc_khz_ref = tsc_khz; - cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - } + kvm_timer_init(); return 0; -- cgit v1.2.2 From 0cca790753bf0cab4b070801a46df8e1297c17f6 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 29 Sep 2009 11:38:35 -1000 Subject: KVM: Kill the confusing tsc_ref_khz and ref_freq variables They are globals, not clearly protected by any ordering or locking, and vulnerable to various startup races. Instead, for variable TSC machines, register the cpufreq notifier and get the TSC frequency directly from the cpufreq machinery. Not only is it always right, it is also perfectly accurate, as no error-prone measurement is required. On such machines, when a new CPU is brought online, it isn't clear what frequency it will start with, and it may not correspond to the reference, thus in hardware_enable we clear the cpu_tsc_khz variable to zero and make sure it is set before running on a VCPU.
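[ed: the resulting flow on a host without X86_FEATURE_CONSTANT_TSC, condensed from the hunks below:]

	/*
	 * kvm_arch_hardware_enable()   clears per_cpu(cpu_tsc_khz) to 0,
	 *                              since the fresh CPU's frequency is
	 *                              not yet known;
	 * kvm_arch_vcpu_load()         lazily refills it from
	 *                              cpufreq_quick_get(cpu);
	 * kvmclock_cpufreq_notifier()  keeps it at freq->new across
	 *                              frequency transitions.
	 */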
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6a31dfb8849c..6f758567831a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1326,6 +1326,8 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); + if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) + per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu); kvm_request_guest_time_update(vcpu); } @@ -3063,9 +3065,6 @@ static void bounce_off(void *info) /* nothing */ } -static unsigned int ref_freq; -static unsigned long tsc_khz_ref; - static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -3074,14 +3073,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm_vcpu *vcpu; int i, send_ipi = 0; - if (!ref_freq) - ref_freq = freq->old; - if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { @@ -3122,12 +3118,14 @@ static void kvm_timer_init(void) { int cpu; - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - tsc_khz_ref = tsc_khz; cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + for_each_online_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = cpufreq_get(cpu); + } else { + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; } } @@ -4700,6 +4698,14 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) int kvm_arch_hardware_enable(void *garbage) { + /* + * Since this may be called from a hotplug notifcation, + * we can't get the CPU frequency directly. + */ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + int cpu = raw_smp_processor_id(); + per_cpu(cpu_tsc_khz, cpu) = 0; + } return kvm_x86_ops->hardware_enable(garbage); } -- cgit v1.2.2 From e6732a5af9dfcc87078706a1598df0efe5010f73 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 29 Sep 2009 11:38:36 -1000 Subject: KVM: Fix printk name error in svm.c Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a4dacab6d8a..d1036ce8917e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -330,13 +330,14 @@ static int svm_hardware_enable(void *garbage) return -EBUSY; if (!has_svm()) { - printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); + printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", + me); return -EINVAL; } svm_data = per_cpu(svm_data, me); if (!svm_data) { - printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", + printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", me); return -EINVAL; } -- cgit v1.2.2 From 3230bb4707278dba25e24cd0a11ea7b2337678ee Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 29 Sep 2009 11:38:37 -1000 Subject: KVM: Fix hotplug of CPUs Both VMX and SVM require per-cpu memory allocation, which is done at module init time, for only online cpus. 
The backend was not allocating enough structures for all possible CPUs, so new CPUs coming online could not be hardware enabled. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d1036ce8917e..02a4269be645 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -482,7 +482,7 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_SVME); } - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) goto err; @@ -516,7 +516,7 @@ static __exit void svm_hardware_unsetup(void) { int cpu; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a187570e4837..97f4265cda38 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1350,15 +1350,17 @@ static void free_kvm_area(void) { int cpu; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) { free_vmcs(per_cpu(vmxarea, cpu)); + per_cpu(vmxarea, cpu) = NULL; + } } static __init int alloc_kvm_area(void) { int cpu; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct vmcs *vmcs; vmcs = alloc_vmcs_cpu(cpu); -- cgit v1.2.2 From 201d945bcfb0d53e67c9c081f7c28532eb4669c7 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 30 Sep 2009 17:39:07 +0200 Subject: KVM: remove pre_task_link setting in save_state_to_tss16 Now, also remove pre_task_link setting in save_state_to_tss16. commit b237ac37a149e8b56436fabf093532483bff13b0 Author: Gleb Natapov Date: Mon Mar 30 16:03:24 2009 +0300 KVM: Fix task switch back link handling. CC: Gleb Natapov Signed-off-by: Juan Quintela Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f758567831a..5f44d565cc9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4203,7 +4203,6 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); - tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); } static int load_state_from_tss16(struct kvm_vcpu *vcpu, -- cgit v1.2.2 From 355be0b9300579e02275d7d19374806a974ce622 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 3 Oct 2009 00:31:21 +0200 Subject: KVM: x86: Refactor guest debug IOCTL handling Much of the so far vendor-specific code for setting up guest debug can actually be handled by the generic code. This also fixes a minor deficit in the SVM part w.r.t. processing KVM_GUESTDBG_ENABLE.
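[ed: condensed division of labour after the refactoring, as visible in the hunks below:]

	/*
	 * generic x86.c ioctl handler:  validates dbg->control, loads
	 *     eff_db[]/dr7, folds KVM_GUESTDBG_SINGLESTEP into TF|RF;
	 * vendor set_guest_debug():     only programs DR7 and the
	 *     exception intercepts, and now returns void.
	 */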
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/svm.c | 14 ++------------ arch/x86/kvm/vmx.c | 18 +----------------- arch/x86/kvm/x86.c | 28 +++++++++++++++++++++------- 4 files changed, 26 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 295c7c4d9c90..e7f870832603 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -475,8 +475,8 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - int (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg); + void (*set_guest_debug)(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 02a4269be645..279a2ae21b4f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1065,26 +1065,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu) vcpu->guest_debug = 0; } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int old_debug = vcpu->guest_debug; struct vcpu_svm *svm = to_svm(vcpu); - vcpu->guest_debug = dbg->control; - - update_db_intercept(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; else svm->vmcb->save.dr7 = vcpu->arch.dr7; - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - else if (old_debug & KVM_GUESTDBG_SINGLESTEP) - svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - - return 0; + update_db_intercept(vcpu); } static void load_host_msrs(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 97f4265cda38..70020e505c22 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1096,30 +1096,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int old_debug = vcpu->guest_debug; - unsigned long flags; - - vcpu->guest_debug = dbg->control; - if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) - vcpu->guest_debug = 0; - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); else vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - flags = vmcs_readl(GUEST_RFLAGS); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - else if (old_debug & KVM_GUESTDBG_SINGLESTEP) - flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - vmcs_writel(GUEST_RFLAGS, flags); - update_exception_bitmap(vcpu); - - return 0; } static __init int cpu_has_kvm_support(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5f44d565cc9b..a06f88e66c89 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4472,12 +4472,19 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int i, r; + unsigned long rflags; + int old_debug; + int i; vcpu_load(vcpu); - if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == - (KVM_GUESTDBG_ENABLE | 
KVM_GUESTDBG_USE_HW_BP)) { + old_debug = vcpu->guest_debug; + + vcpu->guest_debug = dbg->control; + if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) + vcpu->guest_debug = 0; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { for (i = 0; i < KVM_NR_DB_REGS; ++i) vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; vcpu->arch.switch_db_regs = @@ -4488,16 +4495,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } - r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + rflags = kvm_x86_ops->get_rflags(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) + rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + kvm_x86_ops->set_rflags(vcpu, rflags); - if (dbg->control & KVM_GUESTDBG_INJECT_DB) + kvm_x86_ops->set_guest_debug(vcpu, dbg); + + if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); - else if (dbg->control & KVM_GUESTDBG_INJECT_BP) + else if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_BP) kvm_queue_exception(vcpu, BP_VECTOR); vcpu_put(vcpu); - return r; + return 0; } /* -- cgit v1.2.2 From a68a6a7282373bedba8a2ed751b6384edb983a64 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 1 Oct 2009 19:28:39 -0300 Subject: KVM: x86: disable paravirt mmu reporting Disable paravirt MMU capability reporting, so that new (or rebooted) guests switch to native operation. Paravirt MMU is a burden to maintain and does not bring significant advantages compared to shadow anymore. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a06f88e66c89..4693f915f3bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1238,8 +1238,8 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_NR_MEMSLOTS: r = KVM_MEMORY_SLOTS; break; - case KVM_CAP_PV_MMU: - r = !tdp_enabled; + case KVM_CAP_PV_MMU: /* obsolete */ + r = 0; break; case KVM_CAP_IOMMU: r = iommu_found(); -- cgit v1.2.2 From 91586a3b7d79432772a3cdcb81473cd08a237c79 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 5 Oct 2009 13:07:21 +0200 Subject: KVM: x86: Rework guest single-step flag injection and filtering Push TF and RF injection and filtering on guest single-stepping into the vendor get/set_rflags callbacks. This makes the whole mechanism more robust w.r.t. user space IOCTL order and instruction emulations.
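[ed: usage sketch; with the wrappers, the injected trace flags stay invisible to callers regardless of when the debug IOCTL ran:]

	unsigned long rflags;

	rflags = kvm_get_rflags(vcpu);	/* injected TF|RF filtered out */
	/* ... manipulate guest-visible flags ... */
	kvm_set_rflags(vcpu, rflags);	/* TF|RF re-applied while the
					   vcpu is being single-stepped */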
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/x86.c | 77 ++++++++++++++++++++++++----------------- 2 files changed, 48 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e7f870832603..179a919f53a4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -614,6 +614,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4693f915f3bd..385cd0a1e23d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -235,6 +235,25 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) } EXPORT_SYMBOL_GPL(kvm_require_cpl); +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags; + + rflags = kvm_x86_ops->get_rflags(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); + return rflags; +} +EXPORT_SYMBOL_GPL(kvm_get_rflags); + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + kvm_x86_ops->set_rflags(vcpu, rflags); +} +EXPORT_SYMBOL_GPL(kvm_set_rflags); + /* * Load the pae pdptrs. Return true is they are all valid. */ @@ -2777,7 +2796,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) ? 
X86EMUL_MODE_REAL : cs_l @@ -2855,7 +2874,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (vcpu->mmio_is_write) { vcpu->mmio_needed = 0; @@ -3291,7 +3310,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, unsigned long *rflags) { kvm_lmsw(vcpu, msw); - *rflags = kvm_x86_ops->get_rflags(vcpu); + *rflags = kvm_get_rflags(vcpu); } unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) @@ -3329,7 +3348,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, switch (cr) { case 0: kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); - *rflags = kvm_x86_ops->get_rflags(vcpu); + *rflags = kvm_get_rflags(vcpu); break; case 2: vcpu->arch.cr2 = val; @@ -3460,7 +3479,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); if (irqchip_in_kernel(vcpu->kvm)) @@ -3840,13 +3859,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) #endif regs->rip = kvm_rip_read(vcpu); - regs->rflags = kvm_x86_ops->get_rflags(vcpu); - - /* - * Don't leak debug flags in case they were set for guest debugging - */ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + regs->rflags = kvm_get_rflags(vcpu); vcpu_put(vcpu); @@ -3874,12 +3887,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); - #endif kvm_rip_write(vcpu, regs->rip); - kvm_x86_ops->set_rflags(vcpu, regs->rflags); - + kvm_set_rflags(vcpu, regs->rflags); vcpu->arch.exception.pending = false; @@ -4098,7 +4109,7 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) { return (seg != VCPU_SREG_LDTR) && (seg != VCPU_SREG_TR) && - (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); + (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, @@ -4126,7 +4137,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, { tss->cr3 = vcpu->arch.cr3; tss->eip = kvm_rip_read(vcpu); - tss->eflags = kvm_x86_ops->get_rflags(vcpu); + tss->eflags = kvm_get_rflags(vcpu); tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); @@ -4150,7 +4161,7 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, kvm_set_cr3(vcpu, tss->cr3); kvm_rip_write(vcpu, tss->eip); - kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); + kvm_set_rflags(vcpu, tss->eflags | 2); kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); @@ -4188,7 +4199,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { tss->ip = kvm_rip_read(vcpu); - tss->flag = kvm_x86_ops->get_rflags(vcpu); + tss->flag = kvm_get_rflags(vcpu); tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); @@ -4209,7 +4220,7 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { 
kvm_rip_write(vcpu, tss->ip); - kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); + kvm_set_rflags(vcpu, tss->flag | 2); kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); @@ -4355,8 +4366,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } if (reason == TASK_SWITCH_IRET) { - u32 eflags = kvm_x86_ops->get_rflags(vcpu); - kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); + u32 eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); } /* set back link to prev task only if NT bit is set in eflags @@ -4377,8 +4388,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) old_tss_base, &nseg_desc); if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { - u32 eflags = kvm_x86_ops->get_rflags(vcpu); - kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); + u32 eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); } if (reason != TASK_SWITCH_IRET) { @@ -4473,12 +4484,15 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { unsigned long rflags; - int old_debug; int i; vcpu_load(vcpu); - old_debug = vcpu->guest_debug; + /* + * Read rflags as long as potentially injected trace flags are still + * filtered out. + */ + rflags = kvm_get_rflags(vcpu); vcpu->guest_debug = dbg->control; if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) @@ -4495,12 +4509,11 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } - rflags = kvm_x86_ops->get_rflags(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - else if (old_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - kvm_x86_ops->set_rflags(vcpu, rflags); + /* + * Trigger an rflags update that will inject or remove the trace + * flags. + */ + kvm_set_rflags(vcpu, rflags); kvm_x86_ops->set_guest_debug(vcpu, dbg); -- cgit v1.2.2 From e3267cbbbfbcbe9c18833e89b10beabb1117cb55 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 6 Oct 2009 13:24:50 -0400 Subject: KVM: x86: include pvclock MSRs in msrs_to_save For a while now, we have been issuing an rdmsr instruction to find out which msrs in our save list are really supported by the underlying machine. However, it fails to account for kvm-specific msrs, such as the pvclock ones. This patch moves them to the beginning of the list and skips testing them. Cc: stable@kernel.org Signed-off-by: Glauber Costa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 385cd0a1e23d..4de5bc0a8e86 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -503,16 +503,19 @@ static inline u32 bit(int bitno) * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * * This list is modified at module load time to reflect the - * capabilities of the host cpu. + * capabilities of the host cpu. This capabilities test skips MSRs that are + * kvm-specific. Those are put in the beginning of the list.
*/ + +#define KVM_SAVE_MSRS_BEGIN 2 static u32 msrs_to_save[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; @@ -2446,7 +2449,8 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { + /* skip the first msrs in the list. KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; if (j < i) -- cgit v1.2.2 From 8d23c4662427507f432c96ac4fa3b76f0a8360cd Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 Oct 2009 16:08:25 +0200 Subject: KVM: SVM: Notify nested hypervisor of lost event injections If event_inj is valid on a #vmexit the host CPU would write the contents to exit_int_info, so the hypervisor knows that the event wasn't injected. We don't do this in nested SVM by now which is a bug and fixed by this patch. Signed-off-by: Alexander Graf Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 279a2ae21b4f..e37285446cb7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1615,6 +1615,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + + /* + * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have + * to make sure that we do not lose injected events. So check event_inj + * here and copy it to exit_int_info if it is valid. + * Exit_int_info and event_inj can't be both valid because the case + * below only happens on a VMRUN instruction intercept which has + * no valid exit_int_info set. + */ + if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { + struct vmcb_control_area *nc = &nested_vmcb->control; + + nc->exit_int_info = vmcb->control.event_inj; + nc->exit_int_info_err = vmcb->control.event_inj_err; + } + nested_vmcb->control.tlb_ctl = 0; nested_vmcb->control.event_inj = 0; nested_vmcb->control.event_inj_err = 0; -- cgit v1.2.2 From cd3ff653ae0b45bac7a19208e9c75034fcacc85f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 16:08:26 +0200 Subject: KVM: SVM: Move INTR vmexit out of atomic code The nested SVM code emulates a #vmexit caused by a request to open the irq window right in the request function. This is a bug because the request function runs with preemption and interrupts disabled but the #vmexit emulation might sleep. This can cause a schedule()-while-atomic bug and is fixed with this patch. 
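[ed: the fix is a defer-to-safe-context pattern; atomic context only records that a #vmexit is owed, and the exit handler performs it, condensed from the hunks below:]

	/* atomic context (irqs and preemption off): just note the request */
	svm->nested.exit_required = true;

	/* later, in handle_exit(), where sleeping is allowed: */
	if (unlikely(svm->nested.exit_required)) {
		nested_svm_vmexit(svm);		/* may sleep safely here */
		svm->nested.exit_required = false;
		return 1;
	}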
Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e37285446cb7..884bffc70c7f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -85,6 +85,9 @@ struct nested_state { /* gpa pointers to the real vectors */ u64 vmcb_msrpm; + /* A VMEXIT is required but not yet emulated */ + bool exit_required; + /* cache for intercepts of the guest */ u16 intercept_cr_read; u16 intercept_cr_write; @@ -1379,7 +1382,14 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) svm->vmcb->control.exit_code = SVM_EXIT_INTR; - if (nested_svm_exit_handled(svm)) { + if (svm->nested.intercept & 1ULL) { + /* + * The #vmexit can't be emulated here directly because this + * code path runs with irqs and preemtion disabled. A + * #vmexit emulation might sleep. Only signal request for + * the #vmexit here. + */ + svm->nested.exit_required = true; nsvm_printk("VMexit -> INTR\n"); return 1; } @@ -2340,6 +2350,13 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, svm->vmcb->save.rip); + if (unlikely(svm->nested.exit_required)) { + nested_svm_vmexit(svm); + svm->nested.exit_required = false; + + return 1; + } + if (is_nested(svm)) { int vmexit; @@ -2615,6 +2632,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) u16 gs_selector; u16 ldt_selector; + /* + * A vmexit emulation is required before the vcpu can be executed + * again. + */ + if (unlikely(svm->nested.exit_required)) + return; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; -- cgit v1.2.2 From 0ac406de8f3780c8e0801d5719e1ec531d4a6ec4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 16:08:27 +0200 Subject: KVM: SVM: Add tracepoint for nested vmrun This patch adds a dedicated kvm tracepoint for a nested vmrun. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/trace.h | 33 +++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 40 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 884bffc70c7f..907af3f3a7af 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1726,6 +1726,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) /* nested_vmcb is our indicator if nested SVM is activated */ svm->nested.vmcb = svm->vmcb->save.rax; + trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb, + nested_vmcb->save.rip, + nested_vmcb->control.int_ctl, + nested_vmcb->control.event_inj, + nested_vmcb->control.nested_ctl); + /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 0d480e77eacf..b5798e12182a 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -349,6 +349,39 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->coalesced ? 
" (coalesced)" : "") ); +/* + * Tracepoint for nested VMRUN + */ +TRACE_EVENT(kvm_nested_vmrun, + TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl, + __u32 event_inj, bool npt), + TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u64, vmcb ) + __field( __u64, nested_rip ) + __field( __u32, int_ctl ) + __field( __u32, event_inj ) + __field( bool, npt ) + ), + + TP_fast_assign( + __entry->rip = rip; + __entry->vmcb = vmcb; + __entry->nested_rip = nested_rip; + __entry->int_ctl = int_ctl; + __entry->event_inj = event_inj; + __entry->npt = npt; + ), + + TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " + "event_inj: 0x%08x npt: %s\n", + __entry->rip, __entry->vmcb, __entry->nested_rip, + __entry->int_ctl, __entry->event_inj, + __entry->npt ? "on" : "off") +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4de5bc0a8e86..3ab2f9042dd0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4984,3 +4984,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); -- cgit v1.2.2 From d8cabddf7e8fbdced2dd668c98d7762c7ef75245 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 16:08:28 +0200 Subject: KVM: SVM: Add tracepoint for nested #vmexit This patch adds a tracepoint for every #vmexit we get from a nested guest. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/trace.h | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 907af3f3a7af..edf6e8b2b84e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2366,6 +2366,12 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (is_nested(svm)) { int vmexit; + trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, + svm->vmcb->control.exit_info_1, + svm->vmcb->control.exit_info_2, + svm->vmcb->control.exit_int_info, + svm->vmcb->control.exit_int_info_err); + nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", exit_code, svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b5798e12182a..a7eb6299a261 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -382,6 +382,42 @@ TRACE_EVENT(kvm_nested_vmrun, __entry->npt ? 
"on" : "off") ); +/* + * Tracepoint for #VMEXIT while nested + */ +TRACE_EVENT(kvm_nested_vmexit, + TP_PROTO(__u64 rip, __u32 exit_code, + __u64 exit_info1, __u64 exit_info2, + __u32 exit_int_info, __u32 exit_int_info_err), + TP_ARGS(rip, exit_code, exit_info1, exit_info2, + exit_int_info, exit_int_info_err), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u32, exit_code ) + __field( __u64, exit_info1 ) + __field( __u64, exit_info2 ) + __field( __u32, exit_int_info ) + __field( __u32, exit_int_info_err ) + ), + + TP_fast_assign( + __entry->rip = rip; + __entry->exit_code = exit_code; + __entry->exit_info1 = exit_info1; + __entry->exit_info2 = exit_info2; + __entry->exit_int_info = exit_int_info; + __entry->exit_int_info_err = exit_int_info_err; + ), + TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", + __entry->rip, + ftrace_print_symbols_seq(p, __entry->exit_code, + kvm_x86_ops->exit_reasons_str), + __entry->exit_info1, __entry->exit_info2, + __entry->exit_int_info, __entry->exit_int_info_err) +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ab2f9042dd0..192d58efc6dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4985,3 +4985,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); -- cgit v1.2.2 From 17897f366847a9ef8a13e3671a0eb1c15422abed Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 16:08:29 +0200 Subject: KVM: SVM: Add tracepoint for injected #vmexit This patch adds a tracepoint for a nested #vmexit that gets re-injected to the guest. 
Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/trace.h | 33 +++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 40 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index edf6e8b2b84e..369eeb86e87c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1592,6 +1592,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, + vmcb->control.exit_info_1, + vmcb->control.exit_info_2, + vmcb->control.exit_int_info, + vmcb->control.exit_int_info_err); + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); if (!nested_vmcb) return 1; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a7eb6299a261..4d6bb5ee39b5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -418,6 +418,39 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_int_info, __entry->exit_int_info_err) ); +/* + * Tracepoint for #VMEXIT reinjected to the guest + */ +TRACE_EVENT(kvm_nested_vmexit_inject, + TP_PROTO(__u32 exit_code, + __u64 exit_info1, __u64 exit_info2, + __u32 exit_int_info, __u32 exit_int_info_err), + TP_ARGS(exit_code, exit_info1, exit_info2, + exit_int_info, exit_int_info_err), + + TP_STRUCT__entry( + __field( __u32, exit_code ) + __field( __u64, exit_info1 ) + __field( __u64, exit_info2 ) + __field( __u32, exit_int_info ) + __field( __u32, exit_int_info_err ) + ), + + TP_fast_assign( + __entry->exit_code = exit_code; + __entry->exit_info1 = exit_info1; + __entry->exit_info2 = exit_info2; + __entry->exit_int_info = exit_int_info; + __entry->exit_int_info_err = exit_int_info_err; + ), + + TP_printk("reason: %s ext_inf1: 0x%016llx " + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", + ftrace_print_symbols_seq(p, __entry->exit_code, + kvm_x86_ops->exit_reasons_str), + __entry->exit_info1, __entry->exit_info2, + __entry->exit_int_info, __entry->exit_int_info_err) +); #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 192d58efc6dc..a522d9ba81b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4986,3 +4986,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); -- cgit v1.2.2 From 236649de3360916ef85f95c82723af17a25b9179 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 16:08:30 +0200 Subject: KVM: SVM: Add tracepoint for #vmexit because intr pending This patch adds a special tracepoint for the event that a nested #vmexit is injected because kvm wants to inject an interrupt into the guest. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/trace.h | 18 ++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 369eeb86e87c..78a391c60a75 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1390,7 +1390,7 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) * the #vmexit here. 
*/ svm->nested.exit_required = true; - nsvm_printk("VMexit -> INTR\n"); + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); return 1; } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4d6bb5ee39b5..3cc8f444be14 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -451,6 +451,24 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); + +/* + * Tracepoint for nested #vmexit because of interrupt pending + */ +TRACE_EVENT(kvm_nested_intr_vmexit, + TP_PROTO(__u64 rip), + TP_ARGS(rip), + + TP_STRUCT__entry( + __field( __u64, rip ) + ), + + TP_fast_assign( + __entry->rip = rip + ), + + TP_printk("rip: 0x%016llx\n", __entry->rip) +); #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a522d9ba81b3..2cf4146b425a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4987,3 +4987,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); -- cgit v1.2.2 From ec1ff79084fccdae0dca9b04b89dcdf3235bbfa1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 16:08:31 +0200 Subject: KVM: SVM: Add tracepoint for invlpga instruction This patch adds a tracepoint for the event that the guest executed the INVLPGA instruction. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/trace.h | 23 +++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 78a391c60a75..ba18fb7d3657 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1976,6 +1976,9 @@ static int invlpga_interception(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; nsvm_printk("INVLPGA\n"); + trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], + vcpu->arch.regs[VCPU_REGS_RAX]); + /* Let's treat INVLPGA the same as INVLPG (can be optimized!) 
*/ kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 3cc8f444be14..7e1f08e959bc 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -469,6 +469,29 @@ TRACE_EVENT(kvm_nested_intr_vmexit, TP_printk("rip: 0x%016llx\n", __entry->rip) ); + +/* + * Tracepoint for the INVLPGA instruction + */ +TRACE_EVENT(kvm_invlpga, + TP_PROTO(__u64 rip, int asid, u64 address), + TP_ARGS(rip, asid, address), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( int, asid ) + __field( __u64, address ) + ), + + TP_fast_assign( + __entry->rip = rip; + __entry->asid = asid; + __entry->address = address; + ), + + TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", + __entry->rip, __entry->asid, __entry->address) +); #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2cf4146b425a..86596fc7941c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4988,3 +4988,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); -- cgit v1.2.2 From 532a46b98963f110e9425a251e127d6537915dde Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 16:08:32 +0200 Subject: KVM: SVM: Add tracepoint for skinit instruction This patch adds a tracepoint for the event that the guest executed the SKINIT instruction. This information is important because SKINIT is an SVM extension not yet implemented by nested SVM and we may need this information for debugging hypervisors that do not yet run on nested SVM. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 10 +++++++++- arch/x86/kvm/trace.h | 22 ++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ba18fb7d3657..8b9f6fbba48c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1987,6 +1987,14 @@ static int invlpga_interception(struct vcpu_svm *svm) return 1; } +static int skinit_interception(struct vcpu_svm *svm) +{ + trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]); + + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + static int invalid_op_interception(struct vcpu_svm *svm) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); @@ -2350,7 +2358,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, - [SVM_EXIT_SKINIT] = invalid_op_interception, + [SVM_EXIT_SKINIT] = skinit_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, [SVM_EXIT_MWAIT] = invalid_op_interception, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7e1f08e959bc..816e0449db0b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -492,6 +492,28 @@ TRACE_EVENT(kvm_invlpga, TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", __entry->rip, __entry->asid, __entry->address) ); + +/* + * Tracepoint for the SKINIT instruction + */ +TRACE_EVENT(kvm_skinit, + TP_PROTO(__u64 rip, __u32 slb), + TP_ARGS(rip, slb), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u32, slb ) + ), + + TP_fast_assign( + __entry->rip = rip; + 
__entry->slb = slb; + ), + + TP_printk("rip: 0x%016llx slb: 0x%08x\n", + __entry->rip, __entry->slb) +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 86596fc7941c..098e7f886306 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4989,3 +4989,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); -- cgit v1.2.2 From d36f19e9ecd22dc035ef4cc6361b564be650f8e7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 16:08:33 +0200 Subject: KVM: SVM: Remove nsvm_printk debugging code With all important information now delivered through tracepoints we can safely remove the nsvm_printk debugging code for nested SVM. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8b9f6fbba48c..69610c5d6dea 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -53,15 +53,6 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) -/* Turn on to get debugging output*/ -/* #define NESTED_DEBUG */ - -#ifdef NESTED_DEBUG -#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args) -#else -#define nsvm_printk(fmt, args...) do {} while(0) -#endif - static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, @@ -1540,14 +1531,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm) } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - nsvm_printk("exit code: 0x%x\n", exit_code); if (svm->nested.intercept & exit_bits) vmexit = NESTED_EXIT_DONE; } } if (vmexit == NESTED_EXIT_DONE) { - nsvm_printk("#VMEXIT reason=%04x\n", exit_code); nested_svm_vmexit(svm); } @@ -1658,10 +1647,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) /* Restore the original control entries */ copy_vmcb_control_area(vmcb, hsave); - /* Kill any pending exceptions */ - if (svm->vcpu.arch.exception.pending == true) - nsvm_printk("WARNING: Pending Exception\n"); - kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); @@ -1826,25 +1811,14 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) force_new_asid(&svm->vcpu); svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; - if (nested_vmcb->control.int_ctl & V_IRQ_MASK) { - nsvm_printk("nSVM Injecting Interrupt: 0x%x\n", - nested_vmcb->control.int_ctl); - } if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; else svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; - nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n", - nested_vmcb->control.exit_int_info, - nested_vmcb->control.int_state); - svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; - if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID) - nsvm_printk("Injecting Event: 0x%x\n", - nested_vmcb->control.event_inj); svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; @@ -1913,8 +1887,6 @@ static int vmsave_interception(struct vcpu_svm *svm) 
static int vmrun_interception(struct vcpu_svm *svm) { - nsvm_printk("VMrun\n"); - if (nested_svm_check_permissions(svm)) return 1; @@ -1974,7 +1946,6 @@ static int clgi_interception(struct vcpu_svm *svm) static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - nsvm_printk("INVLPGA\n"); trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], vcpu->arch.regs[VCPU_REGS_RAX]); @@ -2389,10 +2360,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm->vmcb->control.exit_int_info, svm->vmcb->control.exit_int_info_err); - nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", - exit_code, svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); - vmexit = nested_svm_exit_special(svm); if (vmexit == NESTED_EXIT_CONTINUE) @@ -2539,7 +2506,6 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - nsvm_printk("Trying to open IRQ window\n"); nested_svm_intr(svm); -- cgit v1.2.2 From 4b8d54f9726f1159330201c5ed2ea30bce7e63ea Mon Sep 17 00:00:00 2001 From: "Zhai, Edwin" Date: Fri, 9 Oct 2009 18:03:20 +0800 Subject: KVM: VMX: Add support for Pause-Loop Exiting New NHM processors will support Pause-Loop Exiting by adding 2 VM-execution control fields: PLE_Gap - upper bound on the amount of time between two successive executions of PAUSE in a loop. PLE_Window - upper bound on the amount of time a guest is allowed to execute in a PAUSE loop. If the time between this execution of PAUSE and the previous one exceeds PLE_Gap, the processor considers this PAUSE to belong to a new loop. Otherwise, the processor determines the total execution time of this loop (since the first PAUSE in this loop) and triggers a VM exit if the total time exceeds PLE_Window. * Refer to SDM volume 3b, sections 21.6.13 & 22.1.3. Pause-Loop Exiting can be used to detect Lock-Holder Preemption, where one VP is sched-out after holding a spinlock and other VPs for the same lock are sched-in, wasting CPU time. Our tests indicate that most spinlocks are held for less than 2^12 cycles. Performance tests show that with 2X LP over-commitment we can get a +2% perf improvement for kernel build (even more perf gain with more LPs). 
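To make the PLE_Gap/PLE_Window interplay concrete, here is an illustrative C model of the per-PAUSE decision described above. It is a sketch of the documented behavior, not the hardware implementation; all names are invented for the example and times are in TSC-rate cycles:

/* Model of the Pause-Loop Exiting decision made on each PAUSE. */
struct ple_state {
	unsigned long long loop_start;	/* time of the first PAUSE in the loop */
	unsigned long long last_pause;	/* time of the most recent PAUSE */
};

/* Returns 1 when the PAUSE executed at time 'now' would trigger a #VMEXIT. */
int pause_triggers_exit(struct ple_state *s, unsigned long long now,
			unsigned long long ple_gap,
			unsigned long long ple_window)
{
	if (now - s->last_pause > ple_gap)
		s->loop_start = now;	/* gap exceeded: a new loop begins */
	s->last_pause = now;
	return now - s->loop_start > ple_window;
}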
Signed-off-by: Zhai Edwin Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 4 ++++ arch/x86/kvm/vmx.c | 51 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 272514c2d456..2b4945419a84 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -56,6 +56,7 @@ #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -144,6 +145,8 @@ enum vmcs_field { VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, TPR_THRESHOLD = 0x0000401c, SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -248,6 +251,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 70020e505c22..a4580d65af59 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,25 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +/* + * These 2 parameters are used to configure the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicates whether PLE is enabled. + * According to tests, this time is usually smaller than 41 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles. + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer to SDM volume 3b, sections 21.6.13 & 22.1.3. 
+ */ +#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; +module_param(ple_gap, int, S_IRUGO); + +static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, int, S_IRUGO); + struct vmcs { u32 revision_id; u32 abort; @@ -319,6 +338,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void) SECONDARY_EXEC_UNRESTRICTED_GUEST; } +static inline int cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -1240,7 +1265,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; + SECONDARY_EXEC_UNRESTRICTED_GUEST | + SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1386,6 +1412,9 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); + if (!cpu_has_vmx_ple()) + ple_gap = 0; + return alloc_kvm_area(); } @@ -2298,9 +2327,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (!ple_gap) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } + if (ple_gap) { + vmcs_write32(PLE_GAP, ple_gap); + vmcs_write32(PLE_WINDOW, ple_window); + } + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -3347,6 +3383,18 @@ out: return ret; } +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + skip_emulated_instruction(vcpu); + kvm_vcpu_on_spin(vcpu); + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -3383,6 +3431,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, }; static const int kvm_vmx_max_exit_handlers = -- cgit v1.2.2 From 565d0998ecac8373b9a9ecd5991abe74318cd235 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Tue, 6 Oct 2009 14:25:02 -0500 Subject: KVM: SVM: Support Pause Filter in AMD processors New AMD processors (Family 0x10 models 8+) support the Pause Filter Feature. This feature creates a new field in the VMCB called Pause Filter Count. If Pause Filter Count is greater than 0 and intercepting PAUSEs is enabled, the processor will increment an internal counter when a PAUSE instruction occurs instead of intercepting. When the internal counter reaches the Pause Filter Count value, a PAUSE intercept will occur. This feature can be used to detect contended spinlocks, especially when the lock holding VCPU is not scheduled. Rescheduling another VCPU prevents the VCPU seeking the lock from wasting its quantum by spinning idly. 
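For context, the guest-side pattern that both this feature and VMX Pause-Loop Exiting target is the ordinary PAUSE-based spin-wait. A minimal sketch of such a loop follows (any spinlock implementation of this shape qualifies); each PAUSE iteration bumps the internal counter, and the intercept fires once the counter reaches pause_filter_count:

/* Guest-side busy-wait loop of the kind Pause Filtering detects. */
static void spin_wait(volatile int *locked)
{
	while (*locked)
		asm volatile("pause" ::: "memory");
}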
Experimental results show that most spinlocks are held for less than 1000 PAUSE cycles or more than a few thousand. Default the Pause Filter Count to 3000 to detect contended spinlocks. Processor support for this feature is indicated by a CPUID bit. On a 24 core system running 4 guests each with 16 VCPUs, this patch improved overall performance of each guest's 32-job kernbench by approximately 3-5% when combined with a scheduler algorithm that caused the VCPU to sleep for a brief period. Further performance improvement may be possible with a more sophisticated yield algorithm. Signed-off-by: Mark Langsdorf Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/svm.h | 3 ++- arch/x86/kvm/svm.c | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 85574b7c1bc1..1fecb7e61130 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u16 intercept_dr_write; u32 intercept_exceptions; u64 intercept; - u8 reserved_1[44]; + u8 reserved_1[42]; + u16 pause_filter_count; u64 iopm_base_pa; u64 msrpm_base_pa; u64 tsc_offset; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 69610c5d6dea..170b2d9c6909 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -46,6 +46,7 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ @@ -654,6 +655,11 @@ static void init_vmcb(struct vcpu_svm *svm) svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { + control->pause_filter_count = 3000; + control->intercept |= (1ULL << INTERCEPT_PAUSE); + } + enable_gif(svm); } @@ -2281,6 +2287,12 @@ static int interrupt_window_interception(struct vcpu_svm *svm) return 1; } +static int pause_interception(struct vcpu_svm *svm) +{ + kvm_vcpu_on_spin(&(svm->vcpu)); + return 1; +} + static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = emulate_on_interception, [SVM_EXIT_READ_CR3] = emulate_on_interception, @@ -2316,6 +2328,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, + [SVM_EXIT_PAUSE] = pause_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, -- cgit v1.2.2 From 6b7d7e762b238f908fe4c3345c2c6eb5c3fdbd59 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 9 Oct 2009 16:26:08 -1000 Subject: KVM: x86: Harden against cpufreq If cpufreq can't determine the CPU khz, or cpufreq is not compiled in, we should fall back to the measured TSC khz. 
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 098e7f886306..3cffa2cac70b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1348,8 +1348,12 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) - per_cpu(cpu_tsc_khz, cpu) = cpufreq_quick_get(cpu); + if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { + unsigned long khz = cpufreq_quick_get(cpu); + if (!khz) + khz = tsc_khz; + per_cpu(cpu_tsc_khz, cpu) = khz; + } kvm_request_guest_time_update(vcpu); } @@ -3144,8 +3148,12 @@ static void kvm_timer_init(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - for_each_online_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = cpufreq_get(cpu); + for_each_online_cpu(cpu) { + unsigned long khz = cpufreq_get(cpu); + if (!khz) + khz = tsc_khz; + per_cpu(cpu_tsc_khz, cpu) = khz; + } } else { for_each_possible_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; -- cgit v1.2.2 From 9fb41ba8962b18159e16cac81e7d57e897964038 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 12 Oct 2009 19:37:31 -0300 Subject: KVM: VMX: fix handle_pause declaration There's no kvm_run argument anymore. Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4580d65af59..364263a25ff8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3387,7 +3387,7 @@ out: * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. */ -static int handle_pause(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_pause(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); -- cgit v1.2.2 From 94c30d9ca6fd00a69e367b91b6e13572c41938c5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Oct 2009 08:51:40 +0200 Subject: KVM: x86: Drop unneeded CONFIG_HAS_IOMEM check This (broken) check dates back to the days when this code was shared across architectures. x86 has IOMEM, so drop it. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3cffa2cac70b..5d450cc6f841 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3814,7 +3814,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (r) goto out; } -#if CONFIG_HAS_IOMEM if (vcpu->mmio_needed) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; @@ -3832,7 +3831,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } } -#endif if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); -- cgit v1.2.2 From ffde22ac53b6d6b1d7206f1172176a667eead778 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Thu, 15 Oct 2009 15:21:43 -0700 Subject: KVM: Xen PV-on-HVM guest support Support for Xen PV-on-HVM guests can be implemented almost entirely in userspace, except for handling one annoying MSR that maps a Xen hypercall blob into guest address space. 
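From userspace, the interface described in the following paragraphs might be driven roughly like this; the struct fields mirror the uapi definition the patch adds, while the MSR index 0x40000000 and the page-granular blob buffers are assumptions of this sketch:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: tell KVM which MSR the guest will write and where the
 * hypercall blobs live; blob sizes are counted in pages. */
static int configure_xen_hvm(int vm_fd, void *blob32, uint8_t pages32,
			     void *blob64, uint8_t pages64)
{
	struct kvm_xen_hvm_config cfg;

	memset(&cfg, 0, sizeof(cfg));
	cfg.msr = 0x40000000;			/* assumed hypercall MSR */
	cfg.blob_addr_32 = (uintptr_t)blob32;
	cfg.blob_size_32 = pages32;
	cfg.blob_addr_64 = (uintptr_t)blob64;
	cfg.blob_size_64 = pages64;

	return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
}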
A generic mechanism to delegate MSR writes to userspace seems overkill and risks encouraging similar MSR abuse in the future. Thus this patch adds special support for the Xen HVM MSR. I implemented a new ioctl, KVM_XEN_HVM_CONFIG, that lets userspace tell KVM which MSR the guest will write to, as well as the starting address and size of the hypercall blobs (one each for 32-bit and 64-bit) that userspace has loaded from files. When the guest writes to the MSR, KVM copies one page of the blob from userspace to the guest. I've tested this patch with a hacked-up version of Gerd's userspace code, booting a number of guests (CentOS 5.3 i386 and x86_64, and FreeBSD 8.0-RC1 amd64) and exercising PV network and block devices. [jan: fix i386 build warning] [avi: future proof abi with a flags field] Signed-off-by: Ed Swierk Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 46 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index f02e87a5206f..ef9b4b73cce4 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -19,6 +19,7 @@ #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 +#define __KVM_HAVE_XEN_HVM /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 179a919f53a4..36f3b53f5c27 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -410,6 +410,8 @@ struct kvm_arch{ unsigned long irq_sources_bitmap; u64 vm_init_tsc; + + struct kvm_xen_hvm_config xen_hvm_config; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5d450cc6f841..bb842db3ee7c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -857,6 +857,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } +static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + int lm = is_long_mode(vcpu); + u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 + : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; + u8 blob_size = lm ? 
kvm->arch.xen_hvm_config.blob_size_64 + : kvm->arch.xen_hvm_config.blob_size_32; + u32 page_num = data & ~PAGE_MASK; + u64 page_addr = data & PAGE_MASK; + u8 *page; + int r; + + r = -E2BIG; + if (page_num >= blob_size) + goto out; + r = -ENOMEM; + page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!page) + goto out; + r = -EFAULT; + if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) + goto out_free; + if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) + goto out_free; + r = 0; +out_free: + kfree(page); +out: + return r; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -972,6 +1004,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) "0x%x data 0x%llx\n", msr, data); break; default: + if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) + return xen_hvm_config(vcpu, data); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); @@ -1246,6 +1280,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: + case KVM_CAP_XEN_HVM: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2441,6 +2476,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_XEN_HVM_CONFIG: { + r = -EFAULT; + if (copy_from_user(&kvm->arch.xen_hvm_config, argp, + sizeof(struct kvm_xen_hvm_config))) + goto out; + r = -EINVAL; + if (kvm->arch.xen_hvm_config.flags) + goto out; + r = 0; + break; + } default: ; } -- cgit v1.2.2 From 94fe45da48f921d01d8ff02a0ad54ee9c326d7f0 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 18 Oct 2009 13:24:44 +0200 Subject: KVM: x86: Fix guest single-stepping while interruptible Commit 705c5323 opened the doors of hell by unconditionally injecting single-step flags as long as guest_debug signaled this. This doesn't work when the guest branches into some interrupt or exception handler and triggers a vmexit with flag reloading. Fix it by saving cs:rip when user space requests single-stepping and restricting the trace flag injection to this guest code position. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/x86.c | 47 ++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 36f3b53f5c27..2536fbd85b3a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -371,6 +371,10 @@ struct kvm_vcpu_arch { u64 mcg_status; u64 mcg_ctl; u64 *mce_banks; + + /* used for guest single stepping over the given code position */ + u16 singlestep_cs; + unsigned long singlestep_rip; }; struct kvm_mem_alias { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb842db3ee7c..13f30aac460b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -235,25 +235,6 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) } EXPORT_SYMBOL_GPL(kvm_require_cpl); -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags; - - rflags = kvm_x86_ops->get_rflags(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); - return rflags; -} -EXPORT_SYMBOL_GPL(kvm_get_rflags); - -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - kvm_x86_ops->set_rflags(vcpu, rflags); -} -EXPORT_SYMBOL_GPL(kvm_set_rflags); - /* * Load the pae pdptrs. Return true is they are all valid. */ @@ -4565,6 +4546,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + vcpu->arch.singlestep_cs = + get_segment_selector(vcpu, VCPU_SREG_CS); + vcpu->arch.singlestep_rip = kvm_rip_read(vcpu); + } + /* * Trigger an rflags update that will inject or remove the trace * flags. @@ -5031,6 +5018,28 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) return kvm_x86_ops->interrupt_allowed(vcpu); } +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags; + + rflags = kvm_x86_ops->get_rflags(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); + return rflags; +} +EXPORT_SYMBOL_GPL(kvm_get_rflags); + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + vcpu->arch.singlestep_cs == + get_segment_selector(vcpu, VCPU_SREG_CS) && + vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) + rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + kvm_x86_ops->set_rflags(vcpu, rflags); +} +EXPORT_SYMBOL_GPL(kvm_set_rflags); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); -- cgit v1.2.2 From 6be7d3062b59af891be7e40c6802350de5f78cef Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 18 Oct 2009 13:24:54 +0200 Subject: KVM: SVM: Cleanup NMI singlestep Push the NMI-related singlestep variable into vcpu_svm. It's dealing with an AMD-specific deficit, nothing generic for x86. 
Acked-by: Gleb Natapov Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm.c | 12 +++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2536fbd85b3a..4d994ad5051a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -354,7 +354,6 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; - bool singlestep; /* guest is single stepped by KVM */ bool nmi_pending; bool nmi_injected; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 170b2d9c6909..ffa6ad216b71 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -107,6 +107,8 @@ struct vcpu_svm { u32 *msrpm; struct nested_state nested; + + bool nmi_singlestep; }; /* enable NPT for AMD64 and X86 with PAE */ @@ -1050,7 +1052,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu) svm->vmcb->control.intercept_exceptions &= ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); - if (vcpu->arch.singlestep) + if (svm->nmi_singlestep) svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { @@ -1195,13 +1197,13 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && - !svm->vcpu.arch.singlestep) { + !svm->nmi_singlestep) { kvm_queue_exception(&svm->vcpu, DB_VECTOR); return 1; } - if (svm->vcpu.arch.singlestep) { - svm->vcpu.arch.singlestep = false; + if (svm->nmi_singlestep) { + svm->nmi_singlestep = false; if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); @@ -2543,7 +2545,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) /* Something prevents NMI from been injected. Single step over possible problem (IRET or exception injection or interrupt shadow) */ - vcpu->arch.singlestep = true; + svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_intercept(vcpu); } -- cgit v1.2.2 From afbcf7ab8d1bc8c2d04792f6d9e786e0adeb328d Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 16 Oct 2009 15:28:36 -0400 Subject: KVM: allow userspace to adjust kvmclock offset When we migrate a kvm guest that uses pvclock between two hosts, we may suffer a large skew. This is because there can be significant differences between the monotonic clocks of the hosts involved. When a new host with a much larger monotonic time starts running the guest, the view of time will be significantly impacted. The situation is much worse when we do the opposite, and migrate to a host with a smaller monotonic clock. This proposed ioctl allows userspace to inform us of the monotonic clock value on the source host, so we can keep the time skew small and, more importantly, ensure that the guest's time never goes backwards. Userspace may also need to retrieve the current data, since from the first migration onwards, it won't be reflected by a simple call to clock_gettime() anymore. 
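A hedged sketch of how a migration flow might drive the resulting ioctl pair; the file-descriptor names are illustrative, while struct kvm_clock_data and the two ioctls are the ones added below:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Source host: snapshot the guest-visible kvmclock value. */
static int save_kvmclock(int src_vm_fd, uint64_t *clock)
{
	struct kvm_clock_data data;

	if (ioctl(src_vm_fd, KVM_GET_CLOCK, &data) < 0)
		return -1;
	*clock = data.clock;	/* ship this in the migration stream */
	return 0;
}

/* Destination host: restore it; KVM computes the offset internally. */
static int restore_kvmclock(int dst_vm_fd, uint64_t clock)
{
	struct kvm_clock_data data = { .clock = clock, .flags = 0 };

	return ioctl(dst_vm_fd, KVM_SET_CLOCK, &data);
}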
[marcelo: future-proof abi with a flags field] [jan: fix KVM_GET_CLOCK by clearing flags field instead of checking it] Signed-off-by: Glauber Costa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 42 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4d994ad5051a..0558ff8c32ae 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -413,6 +413,7 @@ struct kvm_arch{ unsigned long irq_sources_bitmap; u64 vm_init_tsc; + s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 13f30aac460b..e16cdc9ec0c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -680,7 +680,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ vcpu->hv_clock.system_time = ts.tv_nsec + - (NSEC_PER_SEC * (u64)ts.tv_sec); + (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -1262,6 +1263,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_XEN_HVM: + case KVM_CAP_ADJUST_CLOCK: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2468,6 +2470,44 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_SET_CLOCK: { + struct timespec now; + struct kvm_clock_data user_ns; + u64 now_ns; + s64 delta; + + r = -EFAULT; + if (copy_from_user(&user_ns, argp, sizeof(user_ns))) + goto out; + + r = -EINVAL; + if (user_ns.flags) + goto out; + + r = 0; + ktime_get_ts(&now); + now_ns = timespec_to_ns(&now); + delta = user_ns.clock - now_ns; + kvm->arch.kvmclock_offset = delta; + break; + } + case KVM_GET_CLOCK: { + struct timespec now; + struct kvm_clock_data user_ns; + u64 now_ns; + + ktime_get_ts(&now); + now_ns = timespec_to_ns(&now); + user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + user_ns.flags = 0; + + r = -EFAULT; + if (copy_to_user(argp, &user_ns, sizeof(user_ns))) + goto out; + r = 0; + break; + } + default: ; } -- cgit v1.2.2 From fa40052ca04bdbbeb20b839cc8ffe9fa7beefbe9 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 24 Oct 2009 02:49:58 -0200 Subject: KVM: VMX: Use macros instead of hex value on cr0 initialization This should have no effect, it is just to make the code clearer. 
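The equivalence with the old literal is easy to check. A standalone sketch, with the constants restated from the x86 headers (they are bits 4, 29, and 30 of CR0 respectively):

#include <assert.h>

#define X86_CR0_ET 0x00000010	/* extension type */
#define X86_CR0_NW 0x20000000	/* not write-through */
#define X86_CR0_CD 0x40000000	/* cache disable */

int main(void)
{
	/* The macro form spells out the architectural reset value. */
	assert((X86_CR0_NW | X86_CR0_CD | X86_CR0_ET) == 0x60000010);
	return 0;
}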
Signed-off-by: Eduardo Habkost Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 364263a25ff8..17730175aa08 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2538,7 +2538,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = 0x60000010; + vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); -- cgit v1.2.2 From 18fa000ae453767b59ab97477925895a3f0c46ea Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 24 Oct 2009 02:49:59 -0200 Subject: KVM: SVM: Reset cr0 properly on vcpu reset svm_vcpu_reset() was not properly resetting the contents of the guest-visible cr0 register, causing the following issue: https://bugzilla.redhat.com/show_bug.cgi?id=525699 Without resetting cr0 properly, the vcpu was running the SIPI bootstrap routine with paging enabled, making the vcpu get a pagefault exception while trying to run it. Instead of setting vmcb->save.cr0 directly, the new code just resets kvm->arch.cr0 and calls kvm_set_cr0(). The bits that were set/cleared on vmcb->save.cr0 (PG, WP, !CD, !NW) will be set properly by svm_set_cr0(). kvm_set_cr0() is used instead of calling svm_set_cr0() directly to make sure kvm_mmu_reset_context() is called to reset the mmu to nonpaging mode. Signed-off-by: Eduardo Habkost Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ffa6ad216b71..c9ef6c0e1e98 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -628,11 +628,12 @@ static void init_vmcb(struct vcpu_svm *svm) save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; - /* - * cr0 val on cpu init should be 0x60000010, we enable cpu - * cache by default. the orderly way is to enable cache in bios. + /* This is the guest-visible cr0 value. + * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ - save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; + svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + save->cr4 = X86_CR4_PAE; /* rdx = ?? */ -- cgit v1.2.2 From 3ce672d48400e0112fec7a3cb6bb2120493c6e11 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 24 Oct 2009 02:50:00 -0200 Subject: KVM: SVM: init_vmcb(): remove redundant save->cr0 initialization The svm_set_cr0() call will initialize save->cr0 properly even when npt is enabled, clearing the NW and CD bits as expected, so we don't need to initialize it manually for npt_enabled anymore. 
Signed-off-by: Eduardo Habkost Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c9ef6c0e1e98..34b700f9e498 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -648,8 +648,6 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| INTERCEPT_CR3_MASK); save->g_pat = 0x0007040600070406ULL; - /* enable caching because the QEMU Bios doesn't enable it */ - save->cr0 = X86_CR0_ET; save->cr3 = 0; save->cr4 = 0; } -- cgit v1.2.2 From 44ea2b1758d88ad822e65b1c4c21ca6164494e27 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Sep 2009 15:55:37 +0300 Subject: KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area Currently MSR_KERNEL_GS_BASE is saved and restored as part of the guest/host msr reloading. Since we wish to lazy-restore all the other msrs, save and reload MSR_KERNEL_GS_BASE explicitly instead of using the common code. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 17730175aa08..32512519e1ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -99,7 +99,8 @@ struct vcpu_vmx { int save_nmsrs; int msr_offset_efer; #ifdef CONFIG_X86_64 - int msr_offset_kernel_gs_base; + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; #endif struct vmcs *vmcs; struct { @@ -202,7 +203,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu); */ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_K6_STAR, }; @@ -674,10 +675,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - save_msrs(vmx->host_msrs + - vmx->msr_offset_kernel_gs_base, 1); - + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + } #endif load_msrs(vmx->guest_msrs, vmx->save_nmsrs); load_transition_efer(vmx); @@ -711,6 +712,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) save_msrs(vmx->guest_msrs, vmx->save_nmsrs); load_msrs(vmx->host_msrs, vmx->save_nmsrs); reload_host_efer(vmx); +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + } +#endif } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -938,9 +945,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_CSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); /* @@ -954,10 +958,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) #endif vmx->save_nmsrs = save_nmsrs; -#ifdef CONFIG_X86_64 - vmx->msr_offset_kernel_gs_base = - __find_msr_index(vmx, MSR_KERNEL_GS_BASE); -#endif vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); if (cpu_has_vmx_msr_bitmap()) { @@ -1015,6 +1015,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_GS_BASE: data = vmcs_readl(GUEST_GS_BASE); break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(to_vmx(vcpu)); 
+ data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + break; case MSR_EFER: return kvm_get_msr_common(vcpu, msr_index, pdata); #endif @@ -1068,6 +1072,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_GS_BASE: vmcs_writel(GUEST_GS_BASE, data); break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(vmx); + vmx->msr_guest_kernel_gs_base = data; + break; #endif case MSR_IA32_SYSENTER_CS: vmcs_write32(GUEST_SYSENTER_CS, data); break; @@ -1559,6 +1567,11 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + /* + * Force kernel_gs_base reloading before EFER changes, as control + * of this msr depends on is_long_mode(). + */ + vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.shadow_efer = efer; if (!msr) return; -- cgit v1.2.2 From 18863bdd60f895f3b3ba16b15e8331aee781e8ec Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Sep 2009 11:12:18 +0300 Subject: KVM: x86 shared msr infrastructure The various syscall-related MSRs are fairly expensive to switch. Currently we switch them on every vcpu preemption, which is far too often: - if we're switching to a kernel thread (idle task, threaded interrupt, kernel-mode virtio server (vhost-net), for example) and back, then there's no need to switch those MSRs since kernel threads won't be exiting to userspace. - if we're switching to another guest running an identical OS, most likely those MSRs will have the same value, so there's little point in reloading them. - if we're running the same OS on the guest and host, the MSRs will have identical values and reloading is unnecessary. This patch uses the new user return notifiers to implement last-minute switching, and checks the MSR values to avoid unnecessary reloading. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/x86.c | 81 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0558ff8c32ae..26a74b7bb6bc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -809,4 +809,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); +void kvm_define_shared_msr(unsigned index, u32 msr); +void kvm_set_shared_msr(unsigned index, u64 val); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b84e571f4175..4cd498332466 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE + select USER_RETURN_NOTIFIER ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. 
You will need a fairly recent diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e16cdc9ec0c1..58c5cddf363d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -87,6 +88,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); int ignore_msrs = 0; module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +#define KVM_NR_SHARED_MSRS 16 + +struct kvm_shared_msrs_global { + int nr; + struct kvm_shared_msr { + u32 msr; + u64 value; + } msrs[KVM_NR_SHARED_MSRS]; +}; + +struct kvm_shared_msrs { + struct user_return_notifier urn; + bool registered; + u64 current_value[KVM_NR_SHARED_MSRS]; +}; + +static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; +static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -123,6 +143,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +static void kvm_on_user_return(struct user_return_notifier *urn) +{ + unsigned slot; + struct kvm_shared_msr *global; + struct kvm_shared_msrs *locals + = container_of(urn, struct kvm_shared_msrs, urn); + + for (slot = 0; slot < shared_msrs_global.nr; ++slot) { + global = &shared_msrs_global.msrs[slot]; + if (global->value != locals->current_value[slot]) { + wrmsrl(global->msr, global->value); + locals->current_value[slot] = global->value; + } + } + locals->registered = false; + user_return_notifier_unregister(urn); +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ + int cpu; + u64 value; + + if (slot >= shared_msrs_global.nr) + shared_msrs_global.nr = slot + 1; + shared_msrs_global.msrs[slot].msr = msr; + rdmsrl_safe(msr, &value); + shared_msrs_global.msrs[slot].value = value; + for_each_online_cpu(cpu) + per_cpu(shared_msrs, cpu).current_value[slot] = value; +} +EXPORT_SYMBOL_GPL(kvm_define_shared_msr); + +static void kvm_shared_msr_cpu_online(void) +{ + unsigned i; + struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); + + for (i = 0; i < shared_msrs_global.nr; ++i) + locals->current_value[i] = shared_msrs_global.msrs[i].value; +} + +void kvm_set_shared_msr(unsigned slot, u64 value) +{ + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + + if (value == smsr->current_value[slot]) + return; + smsr->current_value[slot] = value; + wrmsrl(shared_msrs_global.msrs[slot].msr, value); + if (!smsr->registered) { + smsr->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&smsr->urn); + smsr->registered = true; + } +} +EXPORT_SYMBOL_GPL(kvm_set_shared_msr); + unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -4815,6 +4893,9 @@ int kvm_arch_hardware_enable(void *garbage) int cpu = raw_smp_processor_id(); per_cpu(cpu_tsc_khz, cpu) = 0; } + + kvm_shared_msr_cpu_online(); + return kvm_x86_ops->hardware_enable(garbage); } -- cgit v1.2.2 From 26bb0981b3ff00b9177d61fe55806db978862b3c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Sep 2009 11:14:12 +0300 Subject: KVM: VMX: Use shared msr infrastructure Instead of reloading syscall MSRs on every preemption, use the new shared msr infrastructure to reload them at the last possible minute (just before exit to userspace). Improves vcpu/idle/vcpu switches by about 2000 cycles (when EFER needs to be reloaded as well). 
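To make the intended usage concrete, here is a hedged consumer sketch of the two entry points declared above; the slot numbering and the particular MSRs are illustrative (the real conversion of VMX to this API follows in the next patch):

/* Kernel-side sketch; assumes the declarations from kvm_host.h above
 * and MSR_STAR/MSR_LSTAR from asm/msr-index.h. */
enum { EXAMPLE_SLOT_STAR, EXAMPLE_SLOT_LSTAR };

static void example_hardware_setup(void)
{
	/* Once at init: bind each slot to an MSR, snapshotting host values. */
	kvm_define_shared_msr(EXAMPLE_SLOT_STAR, MSR_STAR);
	kvm_define_shared_msr(EXAMPLE_SLOT_LSTAR, MSR_LSTAR);
}

static void example_prepare_guest(u64 guest_star, u64 guest_lstar)
{
	/* Before entry: each write happens only if the value differs; host
	 * values are restored lazily by the user-return notifier, just
	 * before the next exit to userspace. */
	kvm_set_shared_msr(EXAMPLE_SLOT_STAR, guest_star);
	kvm_set_shared_msr(EXAMPLE_SLOT_LSTAR, guest_lstar);
}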
[jan: fix slot index missing indirection] Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 112 ++++++++++++++++++++--------------------------------- 1 file changed, 42 insertions(+), 70 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32512519e1ac..bf46253149c3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -86,6 +86,11 @@ struct vmcs { char data[0]; }; +struct shared_msr_entry { + unsigned index; + u64 data; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; struct list_head local_vcpus_link; @@ -93,8 +98,7 @@ struct vcpu_vmx { int launched; u8 fail; u32 idt_vectoring_info; - struct kvm_msr_entry *guest_msrs; - struct kvm_msr_entry *host_msrs; + struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; int msr_offset_efer; @@ -108,7 +112,6 @@ struct vcpu_vmx { u16 fs_sel, gs_sel, ldt_sel; int gs_ldt_reload_needed; int fs_reload_needed; - int guest_efer_loaded; } host_state; struct { int vm86_active; @@ -195,6 +198,8 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static u64 host_efer; + static void ept_save_pdptrs(struct kvm_vcpu *vcpu); /* @@ -209,22 +214,6 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static void load_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - wrmsrl(e[i].index, e[i].data); -} - -static void save_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - rdmsrl(e[i].index, e[i].data); -} - static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -373,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) int i; for (i = 0; i < vmx->nmsrs; ++i) - if (vmx->guest_msrs[i].index == msr) + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) return i; return -1; } @@ -404,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) : : "a" (&operand), "c" (ext) : "cc", "memory"); } -static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -595,17 +584,15 @@ static void reload_tss(void) load_TR_desc(); } -static void load_transition_efer(struct vcpu_vmx *vmx) +static bool update_transition_efer(struct vcpu_vmx *vmx) { int efer_offset = vmx->msr_offset_efer; - u64 host_efer; u64 guest_efer; u64 ignore_bits; if (efer_offset < 0) - return; - host_efer = vmx->host_msrs[efer_offset].data; - guest_efer = vmx->guest_msrs[efer_offset].data; + return false; + guest_efer = vmx->vcpu.arch.shadow_efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -619,26 +606,18 @@ static void load_transition_efer(struct vcpu_vmx *vmx) ignore_bits &= ~(u64)EFER_SCE; #endif if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) - return; + return false; - vmx->host_state.guest_efer_loaded = 1; guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; - wrmsrl(MSR_EFER, guest_efer); - vmx->vcpu.stat.efer_reload++; -} - -static void reload_host_efer(struct vcpu_vmx *vmx) -{ - if (vmx->host_state.guest_efer_loaded) { - vmx->host_state.guest_efer_loaded = 0; - load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); - } + vmx->guest_msrs[efer_offset].data = guest_efer; + return true; } static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; if (vmx->host_state.loaded) return; @@ -680,8 +659,9 @@ static void 
vmx_save_host_state(struct kvm_vcpu *vcpu) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); } #endif - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_transition_efer(vmx); + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data); } static void __vmx_load_host_state(struct vcpu_vmx *vmx) @@ -709,9 +689,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) local_irq_restore(flags); } reload_tss(); - save_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_msrs(vmx->host_msrs, vmx->save_nmsrs); - reload_host_efer(vmx); #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -908,19 +885,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, /* * Swap MSR entry in host/guest MSR entry array. */ -#ifdef CONFIG_X86_64 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) { - struct kvm_msr_entry tmp; + struct shared_msr_entry tmp; tmp = vmx->guest_msrs[to]; vmx->guest_msrs[to] = vmx->guest_msrs[from]; vmx->guest_msrs[from] = tmp; - tmp = vmx->host_msrs[to]; - vmx->host_msrs[to] = vmx->host_msrs[from]; - vmx->host_msrs[from] = tmp; } -#endif /* * Set up the vmcs to automatically save and restore system @@ -929,15 +901,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs; + int save_nmsrs, index; unsigned long *msr_bitmap; vmx_load_host_state(vmx); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { - int index; - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); @@ -956,9 +926,11 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); } #endif - vmx->save_nmsrs = save_nmsrs; + vmx->msr_offset_efer = index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx)) + move_msr_up(vmx, index, save_nmsrs++); - vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); + vmx->save_nmsrs = save_nmsrs; if (cpu_has_vmx_msr_bitmap()) { if (is_long_mode(&vmx->vcpu)) @@ -1000,7 +972,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { u64 data; - struct kvm_msr_entry *msr; + struct shared_msr_entry *msr; if (!pdata) { printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); @@ -1019,9 +991,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) vmx_load_host_state(to_vmx(vcpu)); data = to_vmx(vcpu)->msr_guest_kernel_gs_base; break; +#endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_index, pdata); -#endif case MSR_IA32_TSC: data = guest_read_tsc(); break; @@ -1035,6 +1007,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: + vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { vmx_load_host_state(to_vmx(vcpu)); @@ -1056,7 +1029,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr; + struct shared_msr_entry *msr; u64 host_tsc; int ret = 0; @@ -1565,7 +1538,10 @@ continue_rmode: static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + struct 
shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + if (!msr) + return; /* * Force kernel_gs_base reloading before EFER changes, as control @@ -2417,10 +2393,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (wrmsr_safe(index, data_low, data_high) < 0) continue; data = data_low | ((u64)data_high << 32); - vmx->host_msrs[j].index = index; - vmx->host_msrs[j].reserved = 0; - vmx->host_msrs[j].data = data; - vmx->guest_msrs[j] = vmx->host_msrs[j]; + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; ++vmx->nmsrs; } @@ -3821,7 +3795,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) __clear_bit(vmx->vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); vmx_free_vmcs(vcpu); - kfree(vmx->host_msrs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -3848,10 +3821,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit_vcpu; } - vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vmx->host_msrs) - goto free_guest_msrs; - vmx->vmcs = alloc_vmcs(); if (!vmx->vmcs) goto free_msrs; @@ -3882,8 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) free_vmcs: free_vmcs(vmx->vmcs); free_msrs: - kfree(vmx->host_msrs); -free_guest_msrs: kfree(vmx->guest_msrs); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); @@ -4033,7 +4000,12 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - int r; + int r, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < NR_VMX_MSR; ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) -- cgit v1.2.2 From 1655e3a3dc16e21b60d9950e201b38a9894f1bcf Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 25 Oct 2009 17:45:07 +0200 Subject: KVM: remove duplicated task_switch check Probably introduced by a bad merge. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 58c5cddf363d..dbddcc2d2c97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4525,11 +4525,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); } - /* set back link to prev task only if NT bit is set in eflags - note that old_tss_sel is not used afetr this point */ - if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) - old_tss_sel = 0xffff; - /* set back link to prev task only if NT bit is set in eflags note that old_tss_sel is not used afetr this point */ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) -- cgit v1.2.2 From 7c93be44a4790b0fd9dddf29c5503cf86c105304 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 26 Oct 2009 16:48:33 -0200 Subject: KVM: VMX: move CR3/PDPTR update to vmx_set_cr3 GUEST_CR3 is updated via kvm_set_cr3 whenever CR3 is modified from outside guest context. Similarly pdptrs are updated via load_pdptrs. Let kvm_set_cr3 perform the update, removing it from the vcpu_run fast path. 
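For reference, a minimal sketch of the flow this patch establishes: GUEST_CR3 and the PDPTRs are written once, at the point CR3 actually changes, instead of on every guest entry. The function shape follows the diff below; the body is an illustration, not the verbatim kernel source:

static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	unsigned long guest_cr3 = cr3;

	if (enable_ept) {
		vmcs_write64(EPT_POINTER, construct_eptp(cr3));
		guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
			vcpu->kvm->arch.ept_identity_map_addr;
		ept_load_pdptrs(vcpu);	/* moved here from vmx_vcpu_run() */
	}
	vmx_flush_tlb(vcpu);
	vmcs_writel(GUEST_CR3, guest_cr3);	/* single update point */
}

With this in place, vmx_vcpu_run() no longer needs to re-check the paging state on the hot path.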
Signed-off-by: Marcelo Tosatti Acked-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 5 +---- arch/x86/kvm/x86.c | 4 +++- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bf46253149c3..a5f3f3ec69e6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1737,6 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmcs_write64(EPT_POINTER, eptp); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : vcpu->kvm->arch.ept_identity_map_addr; + ept_load_pdptrs(vcpu); } vmx_flush_tlb(vcpu); @@ -3625,10 +3626,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (enable_ept && is_paging(vcpu)) { - vmcs_writel(GUEST_CR3, vcpu->arch.cr3); - ept_load_pdptrs(vcpu); - } /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dbddcc2d2c97..719f31eecd3d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4591,8 +4591,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (!is_long_mode(vcpu) && is_pae(vcpu)) + if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.cr3); + mmu_reset_needed = 1; + } if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); -- cgit v1.2.2 From 5f5c35aad5ccaa8f1bd5d9e12f9f5251f3180093 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 26 Oct 2009 16:50:14 -0200 Subject: KVM: MMU: update invlpg handler comment Large page translations are always synchronized (either in level 3 or level 2), so it's not necessary to properly deal with them in the invlpg handler. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 72558f8ff3f5..a6017132fba8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -467,7 +467,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) level = iterator.level; sptep = iterator.sptep; - /* FIXME: properly handle invlpg on large guest pages */ if (level == PT_PAGE_TABLE_LEVEL || ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { -- cgit v1.2.2 From 92c0d900159a4fa582e1c8ebcc1c4a8020defff5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Oct 2009 11:00:16 +0200 Subject: KVM: VMX: Remove vmx->msr_offset_efer This variable is used to communicate between a caller and a callee; switch to a function argument instead.
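A minimal before/after sketch of the pattern, with the surrounding setup_msrs() code elided:

/* before: implicit communication through a struct member */
vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
update_transition_efer(vmx);	/* reads vmx->msr_offset_efer internally */

/* after: the offset travels as an explicit argument */
index = __find_msr_index(vmx, MSR_EFER);
if (index >= 0 && update_transition_efer(vmx, index))
	move_msr_up(vmx, index, save_nmsrs++);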
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a5f3f3ec69e6..c9cc9596e1a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -101,7 +101,6 @@ struct vcpu_vmx { struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; - int msr_offset_efer; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; @@ -584,14 +583,11 @@ static void reload_tss(void) load_TR_desc(); } -static bool update_transition_efer(struct vcpu_vmx *vmx) +static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { - int efer_offset = vmx->msr_offset_efer; u64 guest_efer; u64 ignore_bits; - if (efer_offset < 0) - return false; guest_efer = vmx->vcpu.arch.shadow_efer; /* @@ -926,8 +922,8 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); } #endif - vmx->msr_offset_efer = index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx)) + index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx, index)) move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; -- cgit v1.2.2 From 3ddea128ad75bd33e88780fe44f44c3717369b98 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 29 Oct 2009 13:44:15 -0200 Subject: KVM: x86: disallow multiple KVM_CREATE_IRQCHIP Otherwise kvm will leak memory on multiple KVM_CREATE_IRQCHIP. Also serialize multiple accesses with kvm->lock. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/irq.h | 6 +++++- arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index c025a2362aae..be399e207d57 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -86,7 +86,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) static inline int irqchip_in_kernel(struct kvm *kvm) { - return pic_irqchip(kvm) != NULL; + int ret; + + ret = (pic_irqchip(kvm) != NULL); + smp_rmb(); + return ret; } void kvm_pic_reset(struct kvm_kpic_state *s); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 719f31eecd3d..97f6f9565ac9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2362,25 +2362,39 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r) goto out; break; - case KVM_CREATE_IRQCHIP: + case KVM_CREATE_IRQCHIP: { + struct kvm_pic *vpic; + + mutex_lock(&kvm->lock); + r = -EEXIST; + if (kvm->arch.vpic) + goto create_irqchip_unlock; r = -ENOMEM; - kvm->arch.vpic = kvm_create_pic(kvm); - if (kvm->arch.vpic) { + vpic = kvm_create_pic(kvm); + if (vpic) { r = kvm_ioapic_init(kvm); if (r) { - kfree(kvm->arch.vpic); - kvm->arch.vpic = NULL; - goto out; + kfree(vpic); + goto create_irqchip_unlock; } } else - goto out; + goto create_irqchip_unlock; + smp_wmb(); + kvm->arch.vpic = vpic; + smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { + mutex_lock(&kvm->irq_lock); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); - goto out; + kvm->arch.vpic = NULL; + kvm->arch.vioapic = NULL; + mutex_unlock(&kvm->irq_lock); } + create_irqchip_unlock: + mutex_unlock(&kvm->lock); break; + } case KVM_CREATE_PIT: u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; goto create_pit; -- cgit v1.2.2 From 2204ae3c96e9a1fed50f7ee19ce092e69d7dfe82 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 29 Oct 2009 13:44:16 -0200 Subject: KVM: x86: disallow KVM_{SET,GET}_LAPIC 
without an allocated in-kernel lapic Otherwise kvm might attempt to dereference a NULL pointer. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 97f6f9565ac9..cd6fe0a5797f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1893,6 +1893,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_LAPIC: { + r = -EINVAL; + if (!vcpu->arch.apic) + goto out; lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; @@ -1908,6 +1911,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_LAPIC: { + r = -EINVAL; + if (!vcpu->arch.apic) + goto out; lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; if (!lapic) -- cgit v1.2.2 From 4f926bf291863c237188bd2e27222ed801f12094 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 30 Oct 2009 12:46:59 +0100 Subject: KVM: x86: Polish exception injection via KVM_SET_GUEST_DEBUG Decouple KVM_GUESTDBG_INJECT_DB and KVM_GUESTDBG_INJECT_BP from KVM_GUESTDBG_ENABLE, they are actually orthogonal. While at it, avoid triggering the WARN_ON in kvm_queue_exception if there is already an exception pending and reject such invalid requests. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd6fe0a5797f..ba8958dca3c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4656,10 +4656,20 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { unsigned long rflags; - int i; + int i, r; vcpu_load(vcpu); + if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { + r = -EBUSY; + if (vcpu->arch.exception.pending) + goto unlock_out; + if (dbg->control & KVM_GUESTDBG_INJECT_DB) + kvm_queue_exception(vcpu, DB_VECTOR); + else + kvm_queue_exception(vcpu, BP_VECTOR); + } + /* * Read rflags as long as potentially injected trace flags are still * filtered out. @@ -4695,14 +4705,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, kvm_x86_ops->set_guest_debug(vcpu, dbg); - if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_DB) - kvm_queue_exception(vcpu, DB_VECTOR); - else if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_BP) - kvm_queue_exception(vcpu, BP_VECTOR); + r = 0; +unlock_out: vcpu_put(vcpu); - return 0; + return r; } /* -- cgit v1.2.2 From a9c7399d6cda0a092b347f8ee49bbe44f6e1fe66 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 4 Nov 2009 11:54:59 +0200 Subject: KVM: Allow internal errors reported to userspace to carry extra data Usually userspace will freeze the guest so we can inspect it, but some internal state is not available. Add extra data to internal error reporting so we can expose it to the debugger. Extra data is specific to the suberror.
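A hedged sketch of how a userspace VMM might consume the new fields; it assumes run points at the mmap()ed struct kvm_run of the vcpu, which is set up elsewhere:

#include <stdio.h>
#include <linux/kvm.h>

static void report_internal_error(struct kvm_run *run)
{
	__u32 i;

	if (run->exit_reason != KVM_EXIT_INTERNAL_ERROR)
		return;
	fprintf(stderr, "internal error, suberror %u\n",
		run->internal.suberror);
	/* ndata and the data[] contents are specific to the suberror */
	for (i = 0; i < run->internal.ndata; i++)
		fprintf(stderr, "  data[%u] = 0x%llx\n", i,
			(unsigned long long)run->internal.data[i]);
}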
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/vmx.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a9024797b21f..4c3e5b2314cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2800,6 +2800,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) case EMULATE_FAIL: vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; default: BUG(); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c9cc9596e1a6..c0e66dd58a47 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3352,6 +3352,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) kvm_report_emulation_failure(vcpu, "emulation failure"); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; ret = 0; goto out; } -- cgit v1.2.2 From 65ac7264043740572ba804edca03c374d70427c9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 4 Nov 2009 11:59:01 +0200 Subject: KVM: VMX: Report unexpected simultaneous exceptions as internal errors These happen when we trap an exception when another exception is being delivered; we only expect these with MCEs and page faults. If something unexpected happens, things probably went south and we're better off reporting an internal error and freezing. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c0e66dd58a47..22fcd27a0b58 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2744,9 +2744,14 @@ static int handle_exception(struct kvm_vcpu *vcpu) return handle_machine_check(vcpu); if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) - printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " - "intr info 0x%x\n", __func__, vect_info, intr_info); + !is_page_fault(intr_info)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ -- cgit v1.2.2 From 3cfc3092f40bc37c57ba556cfd8de4218f2135ab Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 12 Nov 2009 01:04:25 +0100 Subject: KVM: x86: Add KVM_GET/SET_VCPU_EVENTS This new IOCTL exports all yet user-invisible states related to exceptions, interrupts, and NMIs. Together with appropriate user space changes, this fixes sporadic problems of vmsave/restore, live migration and system reset. 
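A minimal sketch of the intended userspace usage, e.g. around save/restore or live migration; vcpu_fd is assumed to be an already-created KVM vcpu file descriptor:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

struct kvm_vcpu_events events;

/* source side: capture pending exception/interrupt/NMI state */
if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
	perror("KVM_GET_VCPU_EVENTS");

/* ... transfer 'events' alongside the register state ... */

/* destination side: reinject the captured state */
if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0)
	perror("KVM_SET_VCPU_EVENTS");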
[avi: future-proof abi by adding a flags field] Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 28 +++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm.c | 22 ++++++++++++ arch/x86/kvm/vmx.c | 30 ++++++++++++++++ arch/x86/kvm/x86.c | 77 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 159 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index ef9b4b73cce4..950df434763f 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -20,6 +20,7 @@ #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 #define __KVM_HAVE_XEN_HVM +#define __KVM_HAVE_VCPU_EVENTS /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -252,4 +253,31 @@ struct kvm_reinject_control { __u8 pit_reinject; __u8 reserved[31]; }; + +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pad; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 pad; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + __u32 reserved[10]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 26a74b7bb6bc..06e085614dad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -523,6 +523,8 @@ struct kvm_x86_ops { bool has_error_code, u32 error_code); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); + bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); + void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 34b700f9e498..3de0b37ec038 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2499,6 +2499,26 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu) !(svm->vcpu.arch.hflags & HF_NMI_MASK); } +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); +} + +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (masked) { + svm->vcpu.arch.hflags |= HF_NMI_MASK; + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + } else { + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + } +} + static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2946,6 +2966,8 @@ static struct kvm_x86_ops svm_x86_ops = { .queue_exception = svm_queue_exception, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, + .get_nmi_mask = svm_get_nmi_mask, + .set_nmi_mask = svm_set_nmi_mask, .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 22fcd27a0b58..778f059ae423 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2639,6 +2639,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_NMI)); } +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + if (!cpu_has_virtual_nmis()) + return to_vmx(vcpu)->soft_vnmi_blocked; + else + return 
!!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + GUEST_INTR_STATE_NMI); +} + +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + if (vmx->soft_vnmi_blocked != masked) { + vmx->soft_vnmi_blocked = masked; + vmx->vnmi_blocked_time = 0; + } + } else { + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } +} + static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && @@ -3985,6 +4013,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .queue_exception = vmx_queue_exception, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ba8958dca3c4..35eea30821d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1342,6 +1342,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_XEN_HVM: case KVM_CAP_ADJUST_CLOCK: + case KVM_CAP_VCPU_EVENTS: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1883,6 +1884,61 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; } +static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_load(vcpu); + + events->exception.injected = vcpu->arch.exception.pending; + events->exception.nr = vcpu->arch.exception.nr; + events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.error_code = vcpu->arch.exception.error_code; + + events->interrupt.injected = vcpu->arch.interrupt.pending; + events->interrupt.nr = vcpu->arch.interrupt.nr; + events->interrupt.soft = vcpu->arch.interrupt.soft; + + events->nmi.injected = vcpu->arch.nmi_injected; + events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + + events->sipi_vector = vcpu->arch.sipi_vector; + + events->flags = 0; + + vcpu_put(vcpu); +} + +static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + if (events->flags) + return -EINVAL; + + vcpu_load(vcpu); + + vcpu->arch.exception.pending = events->exception.injected; + vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.has_error_code = events->exception.has_error_code; + vcpu->arch.exception.error_code = events->exception.error_code; + + vcpu->arch.interrupt.pending = events->interrupt.injected; + vcpu->arch.interrupt.nr = events->interrupt.nr; + vcpu->arch.interrupt.soft = events->interrupt.soft; + if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + + vcpu->arch.nmi_injected = events->nmi.injected; + vcpu->arch.nmi_pending = events->nmi.pending; + kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + + vcpu->arch.sipi_vector = events->sipi_vector; + + vcpu_put(vcpu); + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2040,6 +2096,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); break; } + case KVM_GET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); + + r = -EFAULT; + if 
(copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) + break; + r = 0; + break; + } + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + r = -EFAULT; + if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) + break; + + r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + break; + } default: r = -EINVAL; } -- cgit v1.2.2 From eb3c79e64a70fb8f7473e30fa07e89c1ecc2c9bb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 24 Nov 2009 15:20:15 +0200 Subject: KVM: x86 emulator: limit instructions to 15 bytes While we are never normally passed an instruction that exceeds 15 bytes, smp games can cause us to attempt to interpret one, which will cause large latencies in non-preempt hosts. Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7ed2c423116..7c18e1230f54 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -129,7 +129,7 @@ struct decode_cache { u8 seg_override; unsigned int d; unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; + unsigned long eip, eip_orig; /* modrm */ u8 modrm; u8 modrm_mod; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d226dff47d77..7e8faea4651e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -622,6 +622,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, { int rc = 0; + /* x86 instructions are limited to 15 bytes. */ + if (eip + size - ctxt->decode.eip_orig > 15) + return X86EMUL_UNHANDLEABLE; eip += ctxt->cs_base; while (size--) { rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); @@ -880,7 +883,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = kvm_rip_read(ctxt->vcpu); + c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); -- cgit v1.2.2 From 046d87103addc117f0d397196e85189722d4d7de Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 27 Nov 2009 16:46:26 +0800 Subject: KVM: VMX: Disable unrestricted guest when EPT disabled Otherwise it would cause a VM entry failure when using ept=0 on processors that support unrestricted guest. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 778f059ae423..ed97c6c7e648 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2309,8 +2309,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) + if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) -- cgit v1.2.2 From 3548bab501887a698a887639b54d5ecaf35c387b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 28 Nov 2009 14:18:47 +0200 Subject: KVM: Drop user return notifier when disabling virtualization on a cpu This way, we don't leave a dangling notifier on cpu hotunplug or module unload.
In particular, module unload leaves the notifier pointing into freed memory. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35eea30821d6..106f9f1f78c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -201,6 +201,14 @@ void kvm_set_shared_msr(unsigned slot, u64 value) } EXPORT_SYMBOL_GPL(kvm_set_shared_msr); +static void drop_user_return_notifiers(void *ignore) +{ + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + + if (smsr->registered) + kvm_on_user_return(&smsr->urn); +} + unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -5004,6 +5012,7 @@ int kvm_arch_hardware_enable(void *garbage) void kvm_arch_hardware_disable(void *garbage) { kvm_x86_ops->hardware_disable(garbage); + drop_user_return_notifiers(garbage); } int kvm_arch_hardware_setup(void) -- cgit v1.2.2 From d5696725b2a4c59503f5e0bc33adeee7f30cd45b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 Dec 2009 12:28:47 +0200 Subject: KVM: VMX: Fix comparison of guest efer with stale host value update_transition_efer() masks out some efer bits when deciding whether to switch the msr during guest entry; for example, NX is emulated using the mmu so we don't need to disable it, and LMA/LME are handled by the hardware. However, with shared msrs, the comparison is made against a stale value; at the time of the guest switch we may be running with another guest's efer. Fix by deferring the mask/compare to the actual point of guest entry. Noted by Marcelo. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx.c | 9 +++++---- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 06e085614dad..4f865e8b8540 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -812,6 +812,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_define_shared_msr(unsigned index, u32 msr); -void kvm_set_shared_msr(unsigned index, u64 val); +void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ed97c6c7e648..d4918d6fc924 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -89,6 +89,7 @@ struct vmcs { struct shared_msr_entry { unsigned index; u64 data; + u64 mask; }; struct vcpu_vmx { @@ -601,12 +602,10 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif - if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) - return false; - guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; return true; } @@ -657,7 +656,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data); + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); } static void __vmx_load_host_state(struct vcpu_vmx *vmx) @@ -2394,6 +2394,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) data = data_low | ((u64)data_high << 32); vmx->guest_msrs[j].index = i; vmx->guest_msrs[j].data = 0; + vmx->guest_msrs[j].mask = 
-1ull; ++vmx->nmsrs; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 106f9f1f78c0..ce677b20bf86 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -185,11 +185,11 @@ static void kvm_shared_msr_cpu_online(void) locals->current_value[i] = shared_msrs_global.msrs[i].value; } -void kvm_set_shared_msr(unsigned slot, u64 value) +void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - if (value == smsr->current_value[slot]) + if (((value ^ smsr->current_value[slot]) & mask) == 0) return; smsr->current_value[slot] = value; wrmsrl(shared_msrs_global.msrs[slot].msr, value); -- cgit v1.2.2 From 4528752f49c1f4025473d12bc5fa9181085c3f22 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 2 Dec 2009 15:05:56 -0800 Subject: x86, Calgary IOMMU quirk: Find nearest matching Calgary while walking up the PCI tree On a multi-node x3950M2 system, there's a slight oddity in the PCI device tree for all secondary nodes: 30:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev e1) \-33:00.0 PCI bridge: IBM CalIOC2 PCI-E Root Port (rev 01) \-34:00.0 RAID bus controller: LSI Logic / Symbios Logic MegaRAID SAS 1078 (rev 04) ...as compared to the primary node: 00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev e1) \-01:00.0 VGA compatible controller: ATI Technologies Inc ES1000 (rev 02) 03:00.0 PCI bridge: IBM CalIOC2 PCI-E Root Port (rev 01) \-04:00.0 RAID bus controller: LSI Logic / Symbios Logic MegaRAID SAS 1078 (rev 04) In both nodes, the LSI RAID controller hangs off a CalIOC2 device, but on the secondary nodes, the BIOS hides the VGA device and substitutes the device tree ending with the disk controller. It would seem that Calgary devices don't necessarily appear at the top of the PCI tree, which means that the current code to find the Calgary IOMMU that goes with a particular device is buggy. Rather than walk all the way to the top of the PCI device tree and try to match bus number with Calgary descriptor, the code needs to examine each parent of the particular device; if it encounters a Calgary with a matching bus number, simply use that. Otherwise, we BUG() when the bus number of the Calgary doesn't match the bus number of whatever's at the top of the device tree. Extra note: This patch appears to work correctly for the x3950 that came before the x3950 M2. Signed-off-by: Darrick J. Wong Acked-by: Muli Ben-Yehuda Cc: FUJITA Tomonori Cc: Joerg Roedel Cc: Yinghai Lu Cc: Jon D. Mason Cc: Corinna Schultz Cc: LKML-Reference: <20091202230556.GG10295@tux1.beaverton.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 849a0995d970..c563e4c8ff39 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -316,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) pdev = to_pci_dev(dev); + /* search up the device tree for an iommu */ pbus = pdev->bus; - - /* is the device behind a bridge? 
Look for the root bus */ - while (pbus->parent) + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; pbus = pbus->parent; - - tbl = pci_iommu(pbus); + } while (pbus); BUG_ON(tbl && (tbl->it_busno != pbus->number)); -- cgit v1.2.2 From 57fea8f7ab67ef42b7f84999e49e47f8717a2d5b Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 3 Dec 2009 19:06:40 +0800 Subject: x86/reboot: Add pci_dev_put in reboot_fixup_32.c for consistency pci_get_device will increase the ref count of found device. Although we're going to reset soon, we should use pci_dev_put to decrease the ref count for consistency. Signed-off-by: Xiaotian Feng Acked-by: H. Peter Anvin Cc: Yinghai Lu LKML-Reference: <1259838400-23833-1-git-send-email-dfeng@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot_fixups_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index 61a837743fe5..201eab63b05f 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -80,6 +80,7 @@ void mach_reboot_fixups(void) continue; cur->reboot_fixup(dev); + pci_dev_put(dev); } } -- cgit v1.2.2 From 7d1849aff6687a135a8da3a75e32a00e3137a5e2 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Thu, 3 Dec 2009 15:52:44 +0100 Subject: x86, apic: Enable lapic nmi watchdog on AMD Family 11h The x86 lapic nmi watchdog does not recognize AMD Family 11h, resulting in: NMI watchdog: CPU not supported As far as I can see from available documentation (the BKDM), family 11h looks identical to family 10h as far as the PMU is concerned. Extending the check to accept family 11h results in: Testing NMI watchdog ... OK. I've been running with this change on a Turion X2 Ultra ZM-82 laptop for a couple of weeks now without problems. Signed-off-by: Mikael Pettersson Cc: Andreas Herrmann Cc: Joerg Roedel Cc: LKML-Reference: <19223.53436.931768.278021@pilspetsen.it.uu.se> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index fab786f60ed6..898df9719afb 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) + boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) return; wd_ops = &k7_wd_ops; break; -- cgit v1.2.2 From be012920ecba161ad20303a3f6d9e96c58cf97c7 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sat, 21 Nov 2009 08:35:55 +0800 Subject: xen: re-register runstate area earlier on resume. This is necessary to ensure the runstate area is available to xen_sched_clock before any calls to printk which will require it in order to provide a timestamp. I chose to pull the xen_setup_runstate_info out of xen_time_init into the caller in order to maintain parity with calling xen_setup_runstate_info separately from calling xen_time_resume. 
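For context, the registration itself is a single hypercall per vcpu; roughly, following xen/time.c at this point:

void xen_setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

This must run before the first sched_clock()-stamped printk on each vcpu, which is what the reordering below ensures.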
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel --- arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/time.c | 5 ++--- arch/x86/xen/xen-ops.h | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dfbf70e65860..cb61f77e4496 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -148,6 +148,8 @@ void xen_vcpu_restore(void) HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) BUG(); + xen_setup_runstate_info(cpu); + xen_vcpu_setup(cpu); if (other_cpu && diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0a5aa44299a5..6bbff94328d2 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -100,7 +100,7 @@ bool xen_vcpu_stolen(int vcpu) return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; } -static void setup_runstate_info(int cpu) +void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; @@ -442,8 +442,6 @@ void xen_setup_timer(int cpu) evt->cpumask = cpumask_of(cpu); evt->irq = irq; - - setup_runstate_info(cpu); } void xen_teardown_timer(int cpu) @@ -494,6 +492,7 @@ __init void xen_time_init(void) setup_force_cpu_cap(X86_FEATURE_TSC); + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 355fa6b99c9c..32529326683d 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,6 +41,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); -- cgit v1.2.2 From 3905bb2aa7bb801b31946b37a4635ebac4009051 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 21 Nov 2009 08:46:29 +0800 Subject: xen: restore runstate_info even if !have_vcpu_info_placement Even if have_vcpu_info_placement is not set, we still need to set up the runstate area on each resumed vcpu. Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel --- arch/x86/xen/enlighten.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cb61f77e4496..a7b49f99a130 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -138,26 +138,23 @@ static void xen_vcpu_setup(int cpu) */ void xen_vcpu_restore(void) { - if (have_vcpu_info_placement) { - int cpu; + int cpu; - for_each_online_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); + for_each_online_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) - BUG(); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + BUG(); - xen_setup_runstate_info(cpu); + xen_setup_runstate_info(cpu); + if (have_vcpu_info_placement) xen_vcpu_setup(cpu); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) - BUG(); - } - - BUG_ON(!have_vcpu_info_placement); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + BUG(); } } -- cgit v1.2.2 From fa24ba62ea2869308ffc9f0b286ac9650b4ca6cb Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sat, 21 Nov 2009 11:32:49 +0000 Subject: xen: correctly restore pfn_to_mfn_list_list after resume pvops kernels >= 2.6.30 can currently only be saved and restored once. 
The second attempt to save results in: ERROR Internal error: Frame# in pfn-to-mfn frame list is not in pseudophys ERROR Internal error: entry 0: p2m_frame_list[0] is 0xf2c2c2c2, max 0x120000 ERROR Internal error: Failed to map/save the p2m frame list I finally narrowed it down to: commit cdaead6b4e657f960d6d6f9f380e7dfeedc6a09b Author: Jeremy Fitzhardinge Date: Fri Feb 27 15:34:59 2009 -0800 xen: split construction of p2m mfn tables from registration Build the p2m_mfn_list_list early with the rest of the p2m table, but register it later when the real shared_info structure is in place. Signed-off-by: Jeremy Fitzhardinge The unforeseen side-effect of this change was to cause the mfn list list to not be rebuilt on resume. Prior to this change it would have been rebuilt via xen_post_suspend() -> xen_setup_shared_info() -> xen_setup_mfn_list_list(). Fix by explicitly calling xen_build_mfn_list_list() from xen_post_suspend(). Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel --- arch/x86/xen/mmu.c | 2 +- arch/x86/xen/suspend.c | 2 ++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3bf7b1d250ce..bf4cd6bfe959 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -185,7 +185,7 @@ static inline unsigned p2m_index(unsigned long pfn) } /* Build the parallel p2m_top_mfn structures */ -static void __init xen_build_mfn_list_list(void) +void xen_build_mfn_list_list(void) { unsigned pfn, idx; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 95be7b434724..6343a5d8e93c 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -27,6 +27,8 @@ void xen_pre_suspend(void) void xen_post_suspend(int suspend_cancelled) { + xen_build_mfn_list_list(); + xen_setup_shared_info(); if (suspend_cancelled) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 32529326683d..f9153a300bce 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -25,6 +25,7 @@ extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); +void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); -- cgit v1.2.2 From f350c7922faad3397c98c81a9e5658f5a1ef0214 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 24 Nov 2009 10:16:23 +0000 Subject: xen: register timer interrupt with IRQF_TIMER Otherwise the timer is disabled by dpm_suspend_noirq() which in turn prevents correct operation of stop_machine on multi-processor systems and breaks suspend. 
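The general pattern, as a hedged sketch: any interrupt that must keep firing through the noirq suspend phase is registered with IRQF_TIMER, as the diff below does for the Xen timer virq. The handler and name here are illustrative:

static irqreturn_t my_timer_interrupt(int irq, void *dev_id)
{
	/* advance clockevent/timekeeping state */
	return IRQ_HANDLED;
}

ret = request_irq(irq, my_timer_interrupt,
		  IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING | IRQF_TIMER,
		  "my-timer", NULL);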
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel --- arch/x86/xen/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 6bbff94328d2..9d1f853120d8 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -434,7 +434,7 @@ void xen_setup_timer(int cpu) name = ""; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, name, NULL); evt = &per_cpu(xen_clock_events, cpu); -- cgit v1.2.2 From 028896721ac04f6fa0697f3ecac3f98761746363 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 24 Nov 2009 09:32:48 -0800 Subject: xen: register runstate on secondary CPUs The commit "xen: re-register runstate area earlier on resume" caused us to never try and setup the runstate area for secondary CPUs. Ensure that we do this... Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel --- arch/x86/xen/smp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index fe03eeed7b48..360f8d8c19cd 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -295,6 +295,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); -- cgit v1.2.2 From 499d19b82b586aef18727b9ae1437f8f37b66e91 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 24 Nov 2009 09:38:25 -0800 Subject: xen: register runstate info for boot CPU early printk timestamping uses sched_clock, which in turn relies on runstate info under Xen. So make sure we set it up before any printks can be called. Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel --- arch/x86/xen/enlighten.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a7b49f99a130..79f97383cde3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1181,6 +1181,8 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("about to get started...\n"); + xen_setup_runstate_info(0); + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); -- cgit v1.2.2 From 6aaf5d633bb6cead81b396d861d7bae4b9a0ba7e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Nov 2009 13:15:38 -0800 Subject: xen: use iret for return from 64b kernel to 32b usermode If Xen wants to return to a 32b usermode with sysret it must use the right form. When using VCGF_in_syscall to trigger this, it looks at the code segment and does a 32b sysret if it is FLAT_USER_CS32. However, this is different from __USER32_CS, so it fails to return properly if we use the normal Linux segment. So avoid the whole mess by dropping VCGF_in_syscall and simply use plain iret to return to usermode. 
Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Cc: Stable Kernel --- arch/x86/xen/xen-asm_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 02f496a8dbaa..53adefda4275 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -96,7 +96,7 @@ ENTRY(xen_sysret32) pushq $__USER32_CS pushq %rcx - pushq $VGCF_in_syscall + pushq $0 1: jmp hypercall_iret ENDPATCH(xen_sysret32) RELOC(xen_sysret32, 1b+1) @@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target) ENTRY(xen_sysenter_target) lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $VGCF_in_syscall + pushq $0 jmp hypercall_iret ENDPROC(xen_syscall32_target) ENDPROC(xen_sysenter_target) -- cgit v1.2.2 From f6eafe3665bcc374c66775d58312d1c06c55303f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 25 Nov 2009 14:12:08 +0000 Subject: xen: call clock resume notifier on all CPUs tick_resume() is never called on secondary processors. Presumably this is because they are offlined for suspend on native and so this is normally taken care of in the CPU onlining path. Under Xen we keep all CPUs online over a suspend. This patch papers over the issue for me but I will investigate a more generic, less hacky, way of doing the same. tick_suspend is also only called on the boot CPU which I presume should be fixed too. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: Thomas Gleixner --- arch/x86/xen/suspend.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 6343a5d8e93c..987267f79bf5 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -46,7 +47,19 @@ void xen_post_suspend(int suspend_cancelled) } +static void xen_vcpu_notify_restore(void *data) +{ + unsigned long reason = (unsigned long)data; + + /* Boot processor notified via generic timekeeping_resume() */ + if ( smp_processor_id() == 0) + return; + + clockevents_notify(reason, NULL); +} + void xen_arch_resume(void) { - /* nothing */ + smp_call_function(xen_vcpu_notify_restore, + (void *)CLOCK_EVT_NOTIFY_RESUME, 1); } -- cgit v1.2.2 From af901ca181d92aac3a7dc265144a9081a86d8f39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Sat, 14 Nov 2009 13:09:05 -0200 Subject: tree-wide: fix assorted typos all over the place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That is "success", "unknown", "through", "performance", "[re|un]mapping", "access", "default", "reasonable", "[con]currently", "temperature", "channel", "[un]used", "application", "example", "hierarchy", "therefore", "[over|under]flow", "contiguous", "threshold", "enough" and others.
Signed-off-by: André Goddard Rosa Signed-off-by: Jiri Kosina --- arch/x86/include/asm/desc_defs.h | 4 ++-- arch/x86/include/asm/mmzone_32.h | 2 +- arch/x86/include/asm/uv/uv_bau.h | 2 +- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/amd_iommu.c | 4 ++-- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/kprobes.c | 4 ++-- arch/x86/mm/kmmio.c | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 9d6684849fd9..278441f39856 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -12,9 +12,9 @@ #include /* - * FIXME: Acessing the desc_struct through its fields is more elegant, + * FIXME: Accessing the desc_struct through its fields is more elegant, * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b acessors, and doing this allow us to do it + * still touches the a and b accessors, and doing this allow us to do it * incrementally. We keep the signature as a struct, rather than an union, * so we can get rid of it transparently in the future -- glommer */ diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index ede6998bd92c..91df7c51806c 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {} /* * generic node memory support, the following assumptions apply: * - * 1) memory comes in 64Mb contigious chunks which are either present or not + * 1) memory comes in 64Mb contiguous chunks which are either present or not * 2) we will not have more than 64Gb in total * * for now assume that 64Gb is max amount of RAM for whole system diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 80e2984f521c..b414d2b401f6 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -55,7 +55,7 @@ #define DESC_STATUS_SOURCE_TIMEOUT 3 /* - * source side threshholds at which message retries print a warning + * source side thresholds at which message retries print a warning */ #define SOURCE_TIMEOUT_LIMIT 20 #define DESTINATION_TIMEOUT_LIMIT 20 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 67e929b89875..1c2c4838d35c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1122,7 +1122,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) if (!acpi_sci_override_gsi) acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); - /* Fill in identity legacy mapings where no override */ + /* Fill in identity legacy mappings where no override */ mp_config_acpi_legacy_irqs(); count = diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0285521e0a99..42ac5e000995 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1637,7 +1637,7 @@ retry: goto out; /* - * aperture was sucessfully enlarged by 128 MB, try + * aperture was successfully enlarged by 128 MB, try * allocation again */ goto retry; @@ -2396,7 +2396,7 @@ int __init amd_iommu_init_passthrough(void) struct pci_dev *dev = NULL; u16 devid, devid2; - /* allocate passthroug domain */ + /* allocate passthrough domain */ pt_domain = protection_domain_alloc(); if (!pt_domain) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b5801c311846..35be5802ac1e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ 
b/arch/x86/kernel/cpu/perf_event.c @@ -1229,7 +1229,7 @@ x86_perf_event_set_period(struct perf_event *event, return 0; /* - * If we are way outside a reasoable range then just skip forward: + * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..7d377379fa4a 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -514,7 +514,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. + * remain disabled throughout this function. */ static int __kprobes kprobe_handler(struct pt_regs *regs) { @@ -851,7 +851,7 @@ no_change: /* * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. + * remain disabled throughout this function. */ static int __kprobes post_kprobe_handler(struct pt_regs *regs) { diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 16ccbd77917f..d16d576beebf 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -203,7 +203,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) */ /* * Interrupts are disabled on entry as trap3 is an interrupt gate - * and they remain disabled thorough out this function. + * and they remain disabled throughout this function. */ int kmmio_handler(struct pt_regs *regs, unsigned long addr) { @@ -302,7 +302,7 @@ no_kmmio: /* * Interrupts are disabled on entry as trap1 is an interrupt gate - * and they remain disabled thorough out this function. + * and they remain disabled throughout this function. * This must always get called as the pair to kmmio_handler(). */ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) -- cgit v1.2.2 From 6070d81eb5f2d4943223c96e7609a53cdc984364 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 4 Dec 2009 15:47:01 -0500 Subject: tree-wide: fix misspelling of "definition" in comments "Definition" is misspelled "defintion" in several comments; this patch fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- arch/x86/include/asm/sigcontext.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 72e5a4491661..04459d25e66e 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -124,7 +124,7 @@ struct sigcontext { * fpstate is really (struct _fpstate *) or (struct _xstate *) * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the defintion of + * of extended memory layout. See comments at the definition of * (struct _fpx_sw_bytes) */ void __user *fpstate; /* zero when no FPU/extended context */ @@ -219,7 +219,7 @@ struct sigcontext { * fpstate is really (struct _fpstate *) or (struct _xstate *) * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the defintion of + * of extended memory layout. 
See comments at the definition of * (struct _fpx_sw_bytes) */ void __user *fpstate; /* zero when no FPU/extended context */ -- cgit v1.2.2 From 575939cf548951dde8df0786899ea5a91bb669b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Nov 2009 18:05:12 -0800 Subject: x86/PCI: claim SR-IOV BARs in pcibios_allocate_resource This allows us to use the BIOS SR-IOV allocations rather than assigning our own later on. Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index b73c09f45210..5dc9e8c63fcd 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -146,16 +146,29 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) } } +struct pci_check_idx_range { + int start; + int end; +}; + static void __init pcibios_allocate_resources(int pass) { struct pci_dev *dev = NULL; - int idx, disabled; + int idx, disabled, i; u16 command; struct resource *r; + struct pci_check_idx_range idx_range[] = { + { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END }, +#ifdef CONFIG_PCI_IOV + { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END }, +#endif + }; + for_each_pci_dev(dev) { pci_read_config_word(dev, PCI_COMMAND, &command); - for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { + for (i = 0; i < ARRAY_SIZE(idx_range); i++) + for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) { r = &dev->resource[idx]; if (r->parent) /* Already allocated */ continue; -- cgit v1.2.2 From 5d990b627537e59a3a2f039ff588a4750e9c1a6a Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 4 Dec 2009 12:15:21 -0800 Subject: PCI: add pci_request_acs Commit ae21ee65e8bc228416bbcc8a1da01c56a847a60c "PCI: acs p2p upsteram forwarding enabling" doesn't actually enable ACS. Add a function to pci core to allow an IOMMU to request that ACS be enabled. The existing mechanism of using iommu_found() in the pci core to know when ACS should be enabled doesn't actually work due to initialization order; iommu has only been detected not initialized. Have Intel and AMD IOMMUs request ACS, and Xen does as well during early init of dom0. 
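A hedged sketch of how an IOMMU detection path uses the new hook; the function and the capability check are illustrative, while pci_request_acs() is the interface added below:

#include <linux/pci.h>

void __init my_iommu_detect(void)
{
	if (!my_iommu_hw_present())	/* illustrative capability check */
		return;
	/* ask the PCI core to enable ACS when devices are enumerated */
	pci_request_acs();
}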
Cc: Allen Kay Cc: David Woodhouse Cc: Jeremy Fitzhardinge Cc: Joerg Roedel Signed-off-by: Chris Wright Signed-off-by: Jesse Barnes --- arch/x86/kernel/amd_iommu_init.c | 2 ++ arch/x86/xen/enlighten.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b4b61d462dcc..e60530a5f524 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1330,6 +1330,8 @@ void __init amd_iommu_detect(void) gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; #endif + /* Make sure ACS will be enabled */ + pci_request_acs(); } } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5bccd706232c..e2511bccbc8d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1170,7 +1171,11 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + } else { + /* Make sure ACS will be enabled */ + pci_request_acs(); } + xen_raw_console_write("about to get started...\n"); -- cgit v1.2.2 From 4832ddda2ec4df96ea1eed334ae2dbd65fc1f541 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Fri, 4 Dec 2009 15:42:22 -0800 Subject: x86: ASUS P4S800 reboot=bios quirk Bug reporter noted their system with an ASUS P4S800 motherboard would hang when rebooting unless reboot=b was specified. Their dmidecode didn't contain descriptive System Information for Manufacturer or Product Name, so I used their Base Board Information to create a reboot quirk patch. The bug reporter confirmed this patch resolves the reboot hang. Handle 0x0001, DMI type 1, 25 bytes System Information Manufacturer: System Manufacturer Product Name: System Name Version: System Version Serial Number: SYS-1234567890 UUID: E0BFCD8B-7948-D911-A953-E486B4EEB67F Wake-up Type: Power Switch Handle 0x0002, DMI type 2, 8 bytes Base Board Information Manufacturer: ASUSTeK Computer INC. Product Name: P4S800 Version: REV 1.xx Serial Number: xxxxxxxxxxx BugLink: http://bugs.launchpad.net/bugs/366682 ASUS P4S800 will hang when rebooting unless reboot=b is specified. Add a quirk to reboot through the bios. Signed-off-by: Leann Ogasawara LKML-Reference: <1259972107.4629.275.camel@emiko> Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f93078746e00..6caf26010970 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -259,6 +259,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, { } }; -- cgit v1.2.2 From a5fc5eba4dfcc284e6adcd7fdcd5b43182230d2b Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 4 Dec 2009 17:44:51 -0800 Subject: x86: Convert BUG() to use unreachable() Use the new unreachable() macro instead of for(;;);. When allyesconfig is built with a GCC-4.5 snapshot on i686 the size of the text segment is reduced by 3987 bytes (from 6827019 to 6823032). Signed-off-by: David Daney Acked-by: "H. 
Peter Anvin" CC: Thomas Gleixner CC: Ingo Molnar CC: x86@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/bug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index d9cf1cd156d2..f654d1bb17fb 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -22,14 +22,14 @@ do { \ ".popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (sizeof(struct bug_entry))); \ - for (;;) ; \ + unreachable(); \ } while (0) #else #define BUG() \ do { \ asm volatile("ud2"); \ - for (;;) ; \ + unreachable(); \ } while (0) #endif -- cgit v1.2.2 From 2f0993e0fb663c49e4d1e02654f6203246be4817 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 07:06:10 +0100 Subject: hw-breakpoints: Drop callback and task parameters from modify helper Drop the callback and task parameters from modify_user_hw_breakpoint(). For now we have no user that needs to modify a breakpoint to the point of changing its handler or its task context. Signed-off-by: Frederic Weisbecker Cc: "K. Prasad" --- arch/x86/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 04d182a7cfdb..dbb395572ae2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -618,7 +618,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, attr.bp_type = gen_type; attr.disabled = disabled; - return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + return modify_user_hw_breakpoint(bp, &attr); } /* @@ -740,7 +740,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr = bp->attr; attr.bp_addr = addr; - bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + bp = modify_user_hw_breakpoint(bp, &attr); } /* * CHECKME: the previous code returned -EIO if the addr wasn't a -- cgit v1.2.2 From b326e9560a28fc3e950637ef51847ed8f05c1335 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 09:44:31 +0100 Subject: hw-breakpoints: Use overflow handler instead of the event callback The struct perf_event::event callback was called when a breakpoint triggered. But this is a rather opaque callback, tied only to the breakpoint API and not really integrated into perf, as it triggered even when we didn't overflow. We prefer to use overflow_handler() as it fits into the perf events rules, being called only when we overflow. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K.
Prasad" --- arch/x86/kernel/hw_breakpoint.c | 5 ++--- arch/x86/kernel/ptrace.c | 9 ++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d42f65ac4927..05d5fec64a94 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -362,8 +362,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - if (bp->callback) - ret = arch_store_info(bp); + ret = arch_store_info(bp); if (ret < 0) return ret; @@ -519,7 +518,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) break; } - (bp->callback)(bp, args->regs); + perf_bp_event(bp, args->regs); rcu_read_unlock(); } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index dbb395572ae2..b361d28061d0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -555,7 +555,9 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, void *data) +static void ptrace_triggered(struct perf_event *bp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { int i; struct thread_struct *thread = &(current->thread); @@ -599,7 +601,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, { int err; int gen_len, gen_type; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; /* * We shoud have at least an inactive breakpoint at this @@ -721,9 +723,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, { struct perf_event *bp; struct thread_struct *t = &tsk->thread; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; if (!t->ptrace_bps[nr]) { + hw_breakpoint_init(&attr); /* * Put stub len and type to register (reserve) an inactive but * correct bp -- cgit v1.2.2 From 7f33f9c5cc3c99aeaf4d266a7ed502b828115a53 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 12:01:17 +0100 Subject: x86/perf: Exclude the debug stack from the callchains Dumping the callchains from breakpoint events with perf gives strange results: 3.75% perf [kernel] [k] _raw_read_unlock | --- _raw_read_unlock perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_bp_event hw_breakpoint_exceptions_notify notifier_call_chain __atomic_notifier_call_chain atomic_notifier_call_chain notify_die do_debug debug munmap We are infected with all the debug stack. Like the nmi stack, the debug stack is undesired as it is part of the profiling path, not helpful for the user. Ignore it. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. 
Prasad" --- arch/x86/kernel/cpu/perf_event.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c1bbed1021d9..d35f26076ae5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2287,7 +2287,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); +static DEFINE_PER_CPU(int, in_ignored_frame); static void @@ -2303,8 +2303,9 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); + per_cpu(in_ignored_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name) || + x86_is_stack_id(DEBUG_STACK, name); return 0; } @@ -2313,7 +2314,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_nmi_frame, smp_processor_id())) + if (per_cpu(in_ignored_frame, smp_processor_id())) return; if (reliable) -- cgit v1.2.2 From b625b3b3b740e177a1148594cd3ad5ff52f35315 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 6 Dec 2009 00:52:56 +0100 Subject: x86: Fixup wrong debug exception frame link in stacktraces While dumping a stacktrace, the end of the exception stack won't link the frame pointer to the previous stack. The interrupted stack will then be considered unreliable and ignored by perf, as the frame pointer is unreliable itself. This happens because we overwrite the frame pointer that links to the interrupted frame with the address of the exception stack. This is done in order to reserve space inside it. But rbp has been chosen here only because it is not a scratch register, so that the address of the exception stack remains in rbp after calling do_debug(); we can then release the exception stack space without the need to retrieve its address again. But we can pick another non-scratch register to do that, so that we preserve the link to the interrupted stack frame in the stacktraces. Just randomly choose r12. All registers are saved just before and restored just after calling do_debug(), and r12 is not used in the middle, which makes it a perfect candidate. Example: perf record -g -a -c 1 -f -e mem:$(tasklist_lock_addr):rw Before: 44.18% [k] _raw_read_lock | | --- |--6.31%-- waitid | |--4.26%-- writev | |--3.63%-- __select | |--3.15%-- __waitpid | | | |--28.57%-- 0x8b52e00000139f | | | |--28.57%-- 0x8b52e0000013c6 | | | |--14.29%-- 0x7fde786dc000 | | | |--14.29%-- 0x62696c2f7273752f | | | --14.29%-- 0x1ea9df800000000 | |--3.00%-- __poll After: 43.94% [k] _raw_read_lock | --- _read_lock | |--60.53%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | synaptics_process_byte | psmouse_handle_byte | psmouse_interrupt | serio_interrupt | i8042_interrupt | handle_IRQ_event | handle_edge_irq | handle_irq | __irqentry_text_start | ret_from_intr | | | |--30.43%-- __select | | | |--17.39%-- 0x454f15 | | | |--13.04%-- __read | | | |--13.04%-- vread_hpet | | | |--13.04%-- _xcb_lock_io | | | --13.04%-- 0x7f630878ce87 Note: this does not only affect perf events but also other x86-64 stacktraces.
They were considered unreliable once we left the debug stack frame. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 722df1b1152d..0f08a0cea3e0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1076,10 +1076,10 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %rbp) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + PER_CPU(init_tss, %r12) + subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) -- cgit v1.2.2 From af2d8289f57e427836be482c6f72cca674028121 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 6 Dec 2009 05:34:27 +0100 Subject: x86: Fixup wrong irq frame link in stacktraces When we enter an irq, two things can happen to preserve the link to the previous frame pointer: - If we were in an irq already, we don't switch to the irq stack as we are already on it. We just need to save the previous frame pointer and link the new one to it. - Otherwise we need another level of indirection. We enter the irq on the previous stack. We save the previous bp there and make bp point to its saved address. Then we switch to the irq stack and push bp again, this time onto the new stack. This leaves two levels to dereference instead of one. In the second case, the current stacktrace code omits the second level and loses the frame pointer accuracy. The stack that follows will then be considered unreliable. Handling that makes the perf callchain happier.
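Schematically, the missing step is one extra dereference when leaving the irq stack. A minimal sketch, assuming CONFIG_FRAME_POINTER (the patch below implements this as fixup_bp_irq_link(); the Before/After callchains follow):

/* Frame layout used by the x86 frame-pointer unwinder */
struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

/*
 * Sketch only: if the interrupted code was not already on the irq
 * stack, the bp found on the irq stack points to the slot where the
 * previous bp was saved on the old stack, not to the old frame
 * itself, so one extra dereference recovers the real frame link.
 */
static unsigned long sketch_fixup_bp(unsigned long bp, int was_on_irq_stack)
{
	struct stack_frame *frame = (struct stack_frame *)bp;

	if (!was_on_irq_stack)
		return (unsigned long)frame->next_frame;

	return bp;
}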
Before: 43.94% [k] _raw_read_lock | --- _read_lock | |--60.53%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | synaptics_process_byte | psmouse_handle_byte | psmouse_interrupt | serio_interrupt | i8042_interrupt | handle_IRQ_event | handle_edge_irq | handle_irq | __irqentry_text_start | ret_from_intr | | | |--30.43%-- __select | | | |--17.39%-- 0x454f15 | | | |--13.04%-- __read | | | |--13.04%-- vread_hpet | | | |--13.04%-- _xcb_lock_io | | | --13.04%-- 0x7f630878ce8 After: 50.00% [k] _raw_read_lock | --- _read_lock | |--98.97%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | | | |--96.88%-- synaptics_process_byte | | psmouse_handle_byte | | psmouse_interrupt | | serio_interrupt | | i8042_interrupt | | handle_IRQ_event | | handle_edge_irq | | handle_irq | | __irqentry_text_start | | ret_from_intr | | | | | |--39.78%-- __const_udelay | | | | | | | |--91.89%-- ath5k_hw_register_timeout | | | | ath5k_hw_noise_floor_calibration | | | | ath5k_hw_reset | | | | ath5k_reset | | | | ath5k_config | | | | ieee80211_hw_config | | | | | | | | | |--88.24%-- ieee80211_scan_work | | | | | worker_thread | | | | | kthread | | | | | child_rip | | | | | | | | | --11.76%-- ieee80211_scan_completed | | | | ieee80211_scan_work | | | | worker_thread | | | | kthread | | | | child_rip | | | | | | | --8.11%-- ath5k_hw_noise_floor_calibration | | | ath5k_hw_reset | | | ath5k_reset | | | ath5k_config Note: This does not only affect perf events but also x86-64 stacktraces. They were considered as unreliable once we quit the irq stack frame. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/kernel/dumpstack_64.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a071e6be177e..004b8aa6a35f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -101,6 +101,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } +static inline int +in_irq_stack(unsigned long *stack, unsigned long *irq_stack, + unsigned long *irq_stack_end) +{ + return (stack >= irq_stack && stack < irq_stack_end); +} + +/* + * We are returning from the irq stack and go to the previous one. + * If the previous stack is also in the irq stack, then bp in the first + * frame of the irq stack points to the previous, interrupted one. + * Otherwise we have another level of indirection: We first save + * the bp of the previous stack, then we switch the stack to the irq one + * and save a new bp that links to the previous one. 
+ * (See save_args()) */ +static inline unsigned long +fixup_bp_irq_link(unsigned long bp, unsigned long *stack, + unsigned long *irq_stack, unsigned long *irq_stack_end) +{ +#ifdef CONFIG_FRAME_POINTER + struct stack_frame *frame = (struct stack_frame *)bp; + + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) + return (unsigned long)frame->next_frame; +#endif + return bp; +} + /* * x86-64 can have up to three kernel stacks: * process stack @@ -173,7 +202,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, irq_stack = irq_stack_end - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - if (stack >= irq_stack && stack < irq_stack_end) { + if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, @@ -184,6 +213,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); + bp = fixup_bp_irq_link(bp, stack, irq_stack, + irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; -- cgit v1.2.2 From 8055039c2a2454c7159dcbde3161943b757a6e0e Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 5 Dec 2009 10:41:34 -0500 Subject: x86: Fix typo in arch/x86/mm/kmmio.c Signed-off-by: Shaun Patterson Cc: Jiri Kosina Cc: pq@iki.fi LKML-Reference: <1260027694.10074.170.camel@linux-4lgc.site> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 16ccbd77917f..72f157247ab1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -490,7 +490,7 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) * 2. remove_kmmio_fault_pages() * Remove the pages from kmmio_page_table. * 3. rcu_free_kmmio_fault_pages() - * Actally free the kmmio_fault_page structs as with RCU. + * Actually free the kmmio_fault_page structs as with RCU. */ void unregister_kmmio_probe(struct kmmio_probe *p) { -- cgit v1.2.2 From be2bf0a2dfbba785860284968fa055006eb1610e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Sun, 6 Dec 2009 12:40:59 +0100 Subject: x86, perf probe: Fix warning in test_get_len() Fix the following warning: arch/x86/tools/test_get_len.c: In function "main": arch/x86/tools/test_get_len.c:116: warning: unused variable "c" Signed-off-by: Jean Delvare Cc: Masami Hiramatsu Signed-off-by: Ingo Molnar --- arch/x86/tools/test_get_len.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index d8214dc03fa7..bee8d6ac2691 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -113,7 +113,7 @@ int main(int argc, char **argv) char line[BUFSIZE], sym[BUFSIZE] = ""; unsigned char insn_buf[16]; struct insn insn; - int insns = 0, c; + int insns = 0; int warnings = 0; parse_args(argc, argv); -- cgit v1.2.2 From cbe5c34c8c1f8ed1afbe6273f4ad57fcfad7822f Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 6 Dec 2009 20:14:29 +0900 Subject: x86: Compile insn.c and inat.c only for KPROBES For now, at least insn.c and inat.c are needed for kprobes. So, compile them only if KPROBES is enabled.
Signed-off-by: OGAWA Hirofumi Cc: Masami Hiramatsu LKML-Reference: <878wdg8icq.fsf@devron.myhome.or.jp> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 4 ++-- arch/x86/lib/Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7d0b681a132b..0e90929da40f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -187,8 +187,8 @@ config HAVE_MMIOTRACE_SUPPORT def_bool y config X86_DECODER_SELFTEST - bool "x86 instruction decoder selftest" - depends on DEBUG_KERNEL + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL && KPROBES ---help--- Perform x86 instruction decoder selftests at build time. This option is useful for checking the sanity of x86 instruction diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a2d6472895fb..442b3b3b2d80 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -20,7 +20,7 @@ lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o -lib-y += insn.o inat.o +lib-$(CONFIG_KPROBES) += insn.o inat.o obj-y += msr-reg.o msr-reg-export.o -- cgit v1.2.2 From a946d8f11f0da9cfc714248036fcfd3a794d1e27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Dec 2009 12:59:46 +0100 Subject: x86: Fix bogus warning in apic_noop.apic_write() apic_noop is used to provide dummy apic functions. It's installed when the CPU has no APIC or when the APIC is disabled on the kernel command line. The apic_noop implementation of apic_write() warns when the CPU has an APIC or when the APIC is not disabled. That's bogus. The warning should only happen when the CPU has an APIC _AND_ the APIC is not disabled. apic_noop.apic_read() has the correct check. Signed-off-by: Thomas Gleixner Cc: Cyrill Gorcunov Cc: # in <= .32 this typo resides in native_apic_write_dummy() LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_noop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index d9acc3bee0f4..e31b9ffe25f5 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -127,7 +127,7 @@ static u32 noop_apic_read(u32 reg) static void noop_apic_write(u32 reg, u32 v) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE(cpu_has_apic && !disable_apic); } struct apic apic_noop = { -- cgit v1.2.2 From d32ba45503acf9c23b301eba2397ca2ee322627b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 7 Dec 2009 12:00:33 -0500 Subject: x86 insn: Delete empty or incomplete inat-tables.c Delete an empty or incomplete inat-tables.c if gen-insn-attr-x86.awk fails, because it causes a build error if the user tries to build the kernel next time.
Reported-by: Arkadiusz Miskiewicz Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Jens Axboe Cc: Frederic Weisbecker LKML-Reference: <20091207170033.19230.37688.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 442b3b3b2d80..45b20e486c2f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -5,7 +5,7 @@ inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ - cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@ $(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) $(call cmd,inat_tables) -- cgit v1.2.2 From bc09effabf0c5c6c7021e5ef9af15a23579b32a8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 8 Dec 2009 11:21:37 +0900 Subject: x86/mce: Set up timer unconditionally mce_timer must be passed to setup_timer() in all cases, no matter whether it is going to be actually used. Otherwise, when the CPU gets brought down, its call to del_timer_sync() will never return, as the timer won't have a base associated, and hence lock_timer_base() will loop infinitely. Signed-off-by: Jan Beulich Signed-off-by: Hidetoshi Seto Cc: LKML-Reference: <4B1DB831.2030801@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d7ebf25d10ed..a96e5cd256a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1388,13 +1388,14 @@ static void __mcheck_cpu_init_timer(void) struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(mce_next_interval); + setup_timer(t, mce_start_timer, smp_processor_id()); + if (mce_ignore_ce) return; *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } -- cgit v1.2.2 From 5c0e9f28da84c68ce0ae68b7a75faaf862e156e2 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 8 Dec 2009 16:52:44 +0900 Subject: x86, mce: fix confusion between bank attributes and mce attributes Commit cebe182033f156b430952370fb0f9dbe6e89b081 had an unnecessary, wrong change: &mce_banks[i].attr is equivalent to the former bank_attrs[i], not to mce_attrs[i]. Signed-off-by: Hidetoshi Seto Acked-by: Andi Kleen LKML-Reference: <4B1E05CC.4040703@jp.fujitsu.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a96e5cd256a9..a8aacd4b513c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1929,7 +1929,7 @@ error2: sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); sysdev_unregister(&per_cpu(mce_dev, cpu)); -- cgit v1.2.2 From f58e1f53de52a70391b6478617311207c7203363 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 8 Dec 2009 22:30:50 -0800 Subject: arch/x86/kernel/microcode*: Use pr_fmt() and remove duplicated KERN_ERR prefix - Use #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Remove "microcode: " prefix from each pr_ - Fix duplicated KERN_ERR prefix - Coalesce pr_ format strings - Add a space after an exclamation point No other change in output. Signed-off-by: Joe Perches Cc: Andy Whitcroft Cc: Andreas Herrmann LKML-Reference: <1260340250.27677.191.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 40 ++++++++++++++++----------------- arch/x86/kernel/microcode_core.c | 26 ++++++++++++---------- arch/x86/kernel/microcode_intel.c | 47 +++++++++++++++++---------------------- 3 files changed, 53 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 63123d902103..37542b67c57e 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,6 +13,9 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. 
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -81,7 +84,7 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -111,8 +114,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - pr_err(KERN_ERR "microcode: CPU%d: loading of chipset " - "specific code not yet supported\n", cpu); + pr_err("CPU%d: loading of chipset specific code not yet supported\n", + cpu); return 0; } @@ -141,12 +144,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - pr_err("microcode: CPU%d: update failed " - "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); + pr_err("CPU%d: update failed (for patch_level=0x%x)\n", + cpu, mc_amd->hdr.patch_id); return -1; } - pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); + pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -169,15 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - pr_err("microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("microcode: error: size mismatch\n"); + pr_err("error: size mismatch\n"); return NULL; } @@ -206,14 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - pr_err("microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - pr_err("microcode: failed to allocate equivalent CPU table\n"); + pr_err("failed to allocate equivalent CPU table\n"); return 0; } @@ -246,7 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - pr_err("microcode: failed to create equivalent cpu table\n"); + pr_err("failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -277,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (!leftover) { vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode " - "update with version 0x%x (current=0x%x)\n", + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); } else { vfree(new_mc); @@ -300,7 +300,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; if (*(u32 *)firmware->data != UCODE_MAGIC) { - pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n", + pr_err("invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)firmware->data); return UCODE_ERROR; } @@ -313,8 +313,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, 
size_t size) { - pr_info("microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); + pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -334,14 +333,13 @@ void init_microcode_amd(struct device *device) WARN_ON(c->x86_vendor != X86_VENDOR_AMD); if (c->x86 < 0x10) { - pr_warning("microcode: AMD CPU family 0x%x not supported\n", - c->x86); + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); return; } supported_cpu = 1; if (request_firmware(&firmware, fw_name, device)) - pr_err("microcode: failed to load file %s\n", fw_name); + pr_err("failed to load file %s\n", fw_name); } void fini_microcode_amd(void) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index e68aae397869..844c02c65fcb 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -209,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); + pr_err("too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -244,7 +247,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -359,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu) if (!uci->mc) return UCODE_NFOUND; - pr_debug("microcode: CPU%d updated upon resume\n", cpu); + pr_debug("CPU%d updated upon resume\n", cpu); apply_microcode_on_target(cpu); return UCODE_OK; @@ -379,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu) ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); if (ustate == UCODE_OK) { - pr_debug("microcode: CPU%d updated upon init\n", cpu); + pr_debug("CPU%d updated upon init\n", cpu); apply_microcode_on_target(cpu); } @@ -406,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) @@ -425,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; @@ -473,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - pr_err("microcode: Failed to create group for CPU%d\n", cpu); + pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); break; case CPU_DEAD: case CPU_UP_CANCELED_FROZEN: @@ -507,7 +510,7 @@ static int __init microcode_init(void) 
microcode_ops = init_amd_microcode(); if (!microcode_ops) { - pr_err("microcode: no support for this CPU vendor\n"); + pr_err("no support for this CPU vendor\n"); return -ENODEV; } @@ -541,8 +544,7 @@ static int __init microcode_init(void) register_hotcpu_notifier(&mc_cpu_notifier); pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " ," - " Peter Oruba\n"); + " , Peter Oruba\n"); return 0; } diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 0d334ddd0a96..ebd193e476ca 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel " - "processor\n", cpu_num); + pr_err("CPU%d not a capable Intel processor\n", cpu_num); return -1; } @@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); + pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc) data_size = get_datasize(mc_header); if (data_size + MC_HEADER_SIZE > total_size) { - printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); + pr_err("error! Bad data size in microcode data file\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); + pr_err("error! Unknown microcode update format\n"); return -EINVAL; } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! " - "Small exttable size in microcode data file\n"); + pr_err("error! Small exttable size in microcode data file\n"); return -EINVAL; } ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { - printk(KERN_ERR "microcode: error! " - "Bad exttable size in microcode data file\n"); + pr_err("error! 
Bad exttable size in microcode data file\n"); return -EFAULT; } ext_sigcount = ext_header->count; @@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc) while (i--) ext_table_sum += ext_tablep[i]; if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, " - "bad extended signature table checksum\n"); + pr_warning("aborting, bad extended signature table checksum\n"); return -EINVAL; } } @@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc) while (i--) orig_sum += ((int *)mc)[i]; if (orig_sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } if (!ext_table_size) @@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc) - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } } @@ -327,13 +324,11 @@ static int apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update " - "to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + pr_err("CPU%d update to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); return -1; } - printk(KERN_INFO "microcode: CPU%d updated to revision " - "0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, @@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc_size = get_totalsize(&mc_header); if (!mc_size || mc_size > leftover) { - printk(KERN_ERR "microcode: error!" - "Bad data in microcode data file\n"); + pr_err("error! Bad data in microcode data file\n"); break; } @@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); out: return state; } @@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) c->x86, c->x86_model, c->x86_mask); if (request_firmware(&firmware, name, device)) { - pr_debug("microcode: data file %s load failed\n", name); + pr_debug("data file %s load failed\n", name); return UCODE_NFOUND; } -- cgit v1.2.2 From 44234adcdce38f83c56e05f808ce656175b4beeb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Dec 2009 09:25:48 +0100 Subject: hw-breakpoints: Modify breakpoints without unregistering them Currently, when ptrace needs to modify a breakpoint, like disabling it, changing its address, type or len, it calls modify_user_hw_breakpoint(). The latter performs the heavy and racy task of unregistering the old breakpoint and registering a new one. This is racy as someone else might steal the reserved breakpoint slot under us, which is undesired as the breakpoint is only supposed to be modified, sometimes in the middle of a debugging workflow. We don't want our slot to be stolen in the middle. So instead of unregistering/registering the breakpoint, just disable it while we modify its breakpoint fields and re-enable it afterwards if necessary.
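The shape of that approach, as a minimal and purely illustrative sketch (the real helper also revalidates the arch-specific fields and handles errors; perf_event_disable()/perf_event_enable() are the existing perf APIs, the rest is abbreviated):

/*
 * Illustrative sketch of modify-in-place: disable the event so the
 * debug register slot stays reserved, update the breakpoint fields,
 * then re-arm it. Validation and error handling are omitted.
 */
int sketch_modify_user_hw_breakpoint(struct perf_event *bp,
				     struct perf_event_attr *attr)
{
	perf_event_disable(bp);		/* slot remains ours */

	bp->attr.bp_addr = attr->bp_addr;
	bp->attr.bp_type = attr->bp_type;
	bp->attr.bp_len  = attr->bp_len;

	if (!attr->disabled)
		perf_event_enable(bp);	/* re-arm with the new fields */

	return 0;
}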
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Prasad LKML-Reference: <1260347148-5519-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 57 ++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b361d28061d0..7079ddaf0731 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -595,7 +595,7 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static struct perf_event * +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct task_struct *tsk, int disabled) { @@ -609,11 +609,11 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, * written the address register first */ if (!bp) - return ERR_PTR(-EINVAL); + return -EINVAL; err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (err) - return ERR_PTR(err); + return err; attr = bp->attr; attr.bp_len = gen_len; @@ -658,28 +658,17 @@ restore: if (!second_pass) continue; - thread->ptrace_bps[i] = NULL; - bp = ptrace_modify_breakpoint(bp, len, type, + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 1); - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + if (rc) break; - } - thread->ptrace_bps[i] = bp; } continue; } - bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); - - /* Incorrect bp, or we have a bug in bp API */ - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + if (rc) break; - } - thread->ptrace_bps[i] = bp; } /* * Make a second pass to free the remaining unused breakpoints @@ -737,26 +726,32 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.disabled = 1; bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + + /* + * CHECKME: the previous code returned -EIO if the addr wasn't + * a valid task virtual addr. The new one will return -EINVAL in + * this case. + * -EINVAL may be what we want for in-kernel breakpoints users, + * but -EIO looks better for ptrace, since we refuse a register + * writing for the user. And anyway this is the previous + * behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; } else { + int err; + bp = t->ptrace_bps[nr]; - t->ptrace_bps[nr] = NULL; attr = bp->attr; attr.bp_addr = addr; - bp = modify_user_hw_breakpoint(bp, &attr); + err = modify_user_hw_breakpoint(bp, &attr); + if (err) + return err; } - /* - * CHECKME: the previous code returned -EIO if the addr wasn't a - * valid task virtual addr. The new one will return -EINVAL in this - * case. - * -EINVAL may be what we want for in-kernel breakpoints users, but - * -EIO looks better for ptrace, since we refuse a register writing - * for the user. And anyway this is the previous behaviour. - */ - if (IS_ERR(bp)) - return PTR_ERR(bp); - t->ptrace_bps[nr] = bp; return 0; } -- cgit v1.2.2 From 814e2c84a722c45650a9b8f52285d7ba6874f63b Mon Sep 17 00:00:00 2001 From: Andy Isaacson Date: Tue, 8 Dec 2009 00:29:42 -0800 Subject: x86: Factor duplicated code out of __show_regs() into show_regs_common() Unify x86_32 and x86_64 implementations of __show_regs() header, standardizing on the x86_64 format string in the process. Also, 32-bit will now call print_modules. 
Signed-off-by: Andy Isaacson Cc: Arjan van de Ven Cc: Robert Hancock Cc: Richard Zidlicky Cc: Andrew Morton LKML-Reference: <20091208082942.GA27174@hexapodia.org> [ v2: resolved conflict ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/system.h | 1 + arch/x86/kernel/process.c | 18 ++++++++++++++++++ arch/x86/kernel/process_32.c | 14 +------------- arch/x86/kernel/process_64.c | 16 ++-------------- 4 files changed, 22 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 022a84386de8..ecb544e65382 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct task_struct *prev, struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); +extern void show_regs_common(void); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e2ba634ea15..90cf1250a005 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -90,6 +92,22 @@ void exit_thread(void) } } +void show_regs_common(void) +{ + const char *board; + + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + + printk("\n"); + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board); +} + void flush_thread(void) { struct task_struct *tsk = current; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 075580b35682..120b88797a75 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -128,7 +126,6 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; - const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -140,16 +137,7 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk("\n"); - - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", - task_pid_nr(current), current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + show_regs_common(); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c95c8f4e790a..e5ab0cd0ef36 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -38,7 +37,6 @@ #include #include #include -#include #include #include @@ -163,18 +161,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; - const char *board; - - printk("\n"); - print_modules(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - 
init_utsname()->version, board); + + show_regs_common(); printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, -- cgit v1.2.2 From a1884b8e558ef6395f6033f9e1b69b332dd040e0 Mon Sep 17 00:00:00 2001 From: Andy Isaacson Date: Tue, 8 Dec 2009 00:30:21 -0800 Subject: x86: Print DMI_BOARD_NAME as well as DMI_PRODUCT_NAME from __show_regs() Robert Hancock observes that DMI_BOARD_NAME is often more useful than DMI_PRODUCT_NAME, especially on standalone motherboards. So, print both. Signed-off-by: Andy Isaacson Cc: Arjan van de Ven Cc: Robert Hancock Cc: Richard Zidlicky Cc: Andrew Morton LKML-Reference: <20091208083021.GB27174@hexapodia.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 90cf1250a005..7a7bd4e3ec49 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -94,18 +94,21 @@ void exit_thread(void) void show_regs_common(void) { - const char *board; + const char *board, *product; - board = dmi_get_system_info(DMI_PRODUCT_NAME); + board = dmi_get_system_info(DMI_BOARD_NAME); if (!board) board = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; printk("\n"); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + init_utsname()->version, board, product); } void flush_thread(void) -- cgit v1.2.2 From e258e4e0b495e6ecbd073d6bef1eafb62a58919a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:51 -0500 Subject: x86-32: Add new pt_regs stubs Add new stubs which add the pt_regs pointer as the last arg, matching 64-bit. This will allow these syscalls to be easily merged. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 49 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 50b9c220e121..34dbfa909dd7 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -725,22 +725,49 @@ END(syscall_badsys) /* * System calls that need a pt_regs pointer. 
*/ -#define PTREGSCALL(name) \ +#define PTREGSCALL0(name) \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%eax; \ jmp sys_##name; -PTREGSCALL(iopl) -PTREGSCALL(fork) -PTREGSCALL(clone) -PTREGSCALL(vfork) -PTREGSCALL(execve) -PTREGSCALL(sigaltstack) -PTREGSCALL(sigreturn) -PTREGSCALL(rt_sigreturn) -PTREGSCALL(vm86) -PTREGSCALL(vm86old) +#define PTREGSCALL1(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%edx; \ + movl PT_EBX(%edx),%eax; \ + jmp sys_##name; + +#define PTREGSCALL2(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%ecx; \ + movl PT_ECX(%ecx),%edx; \ + movl PT_EBX(%ecx),%eax; \ + jmp sys_##name; + +#define PTREGSCALL3(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%eax; \ + pushl %eax; \ + movl PT_EDX(%eax),%ecx; \ + movl PT_ECX(%eax),%edx; \ + movl PT_EBX(%eax),%eax; \ + call sys_##name; \ + addl $4,%esp; \ + ret + +PTREGSCALL0(iopl) +PTREGSCALL0(fork) +PTREGSCALL0(clone) +PTREGSCALL0(vfork) +PTREGSCALL0(execve) +PTREGSCALL0(sigaltstack) +PTREGSCALL0(sigreturn) +PTREGSCALL0(rt_sigreturn) +PTREGSCALL0(vm86) +PTREGSCALL0(vm86old) .macro FIXUP_ESPFIX_STACK /* -- cgit v1.2.2 From 27f59559d63375a4d59e7c720a439d9f0b47edad Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:52 -0500 Subject: x86: Merge sys_iopl Change 32-bit sys_iopl to PTREGSCALL1, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/syscalls.h | 6 +----- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/ioport.c | 28 +++++----------------------- 3 files changed, 7 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 372b76edd63f..4b694cd904c4 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -18,6 +18,7 @@ /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +long sys_iopl(unsigned int, struct pt_regs *); /* kernel/process.c */ int sys_fork(struct pt_regs *); @@ -35,8 +36,6 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 -/* kernel/ioport.c */ -long sys_iopl(struct pt_regs *); /* kernel/process_32.c */ int sys_clone(struct pt_regs *); @@ -70,9 +69,6 @@ int sys_vm86(struct pt_regs *); #else /* CONFIG_X86_32 */ /* X86_64 only */ -/* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned int, struct pt_regs *); - /* kernel/process_64.c */ asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 34dbfa909dd7..ab7fcef37453 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -758,7 +758,7 @@ ptregs_##name: \ addl $4,%esp; \ ret -PTREGSCALL0(iopl) +PTREGSCALL1(iopl) PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 99c4d308f16b..85ecc7c57ba6 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * on system-call entry - see also fork() and the signal handling * code. 
*/ -static int do_iopl(unsigned int level, struct pt_regs *regs) +long sys_iopl(unsigned int level, struct pt_regs *regs) { unsigned int old = (regs->flags >> 12) & 3; + struct thread_struct *t = ¤t->thread; if (level > 3) return -EINVAL; @@ -115,29 +116,10 @@ static int do_iopl(unsigned int level, struct pt_regs *regs) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); - - return 0; -} - #ifdef CONFIG_X86_32 -long sys_iopl(struct pt_regs *regs) -{ - unsigned int level = regs->bx; - struct thread_struct *t = ¤t->thread; - int rc; - - rc = do_iopl(level, regs); - if (rc < 0) - goto out; - t->iopl = level << 12; set_iopl_mask(t->iopl); -out: - return rc; -} -#else -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - return do_iopl(level, regs); -} #endif + + return 0; +} -- cgit v1.2.2 From 11cf88bd0b8165b65aaabaee0977e9a3ad474ab7 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:53 -0500 Subject: x86: Merge sys_execve Change 32-bit sys_execve to PTREGSCALL3, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/syscalls.h | 6 ++---- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/process.c | 26 ++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 25 ------------------------- arch/x86/kernel/process_64.c | 19 ------------------- 5 files changed, 29 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 4b694cd904c4..48c48e508b9f 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -23,6 +23,8 @@ long sys_iopl(unsigned int, struct pt_regs *); /* kernel/process.c */ int sys_fork(struct pt_regs *); int sys_vfork(struct pt_regs *); +long sys_execve(char __user *, char __user * __user *, + char __user * __user *, struct pt_regs *); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); @@ -39,7 +41,6 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* kernel/process_32.c */ int sys_clone(struct pt_regs *); -int sys_execve(struct pt_regs *); /* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); @@ -73,9 +74,6 @@ int sys_vm86(struct pt_regs *); asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); -asmlinkage long sys_execve(char __user *, char __user * __user *, - char __user * __user *, - struct pt_regs *); long sys_arch_prctl(int, unsigned long); /* kernel/signal.c */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index ab7fcef37453..a96a0d8a0fdb 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -762,7 +762,7 @@ PTREGSCALL1(iopl) PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) -PTREGSCALL0(execve) +PTREGSCALL3(execve) PTREGSCALL0(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e2ba634ea15..bb17bd9334fb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -235,6 +235,32 @@ int sys_vfork(struct pt_regs *regs) } +/* + * sys_execve() executes a new program. 
+ */ +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs *regs) +{ + long error; + char *filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, regs); + +#ifdef CONFIG_X86_32 + if (error == 0) { + /* Make sure we don't return using sysenter.. */ + set_thread_flag(TIF_IRET); + } +#endif + + putname(filename); + return error; +} + /* * Idle related variables and functions */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 075580b35682..486e38e2900b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -451,31 +451,6 @@ int sys_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); } -/* - * sys_execve() executes a new program. - */ -int sys_execve(struct pt_regs *regs) -{ - int error; - char *filename; - - filename = getname((char __user *) regs->bx); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = do_execve(filename, - (char __user * __user *) regs->cx, - (char __user * __user *) regs->dx, - regs); - if (error == 0) { - /* Make sure we don't return using sysenter.. */ - set_thread_flag(TIF_IRET); - } - putname(filename); -out: - return error; -} - #define top_esp (THREAD_SIZE - sizeof(unsigned long)) #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c95c8f4e790a..671960d82587 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -520,25 +520,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -/* - * sys_execve() executes a new program. - */ -asmlinkage -long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) -{ - long error; - char *filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = do_execve(filename, argv, envp, regs); - putname(filename); - return error; -} - void set_personality_64bit(void) { /* inherit personality from parent */ -- cgit v1.2.2 From 052acad48a566a6dbcccb95e5d22e5e1b7cac8dd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:54 -0500 Subject: x86: Merge sys_sigaltstack Change 32-bit sys_sigaltstack to PTREGSCALL2, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-5-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/syscalls.h | 8 +++----- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/signal.c | 12 +----------- 3 files changed, 5 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 48c48e508b9f..94e0b61fb040 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -31,6 +31,9 @@ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ long sys_rt_sigreturn(struct pt_regs *); +long sys_sigaltstack(const stack_t __user *, stack_t __user *, + struct pt_regs *); + /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); @@ -46,7 +49,6 @@ int sys_clone(struct pt_regs *); asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); -int sys_sigaltstack(struct pt_regs *); unsigned long sys_sigreturn(struct pt_regs *); /* kernel/sys_i386_32.c */ @@ -76,10 +78,6 @@ asmlinkage long sys_clone(unsigned long, unsigned long, struct pt_regs *); long sys_arch_prctl(int, unsigned long); -/* kernel/signal.c */ -asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); - /* kernel/sys_x86_64.c */ struct new_utsname; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a96a0d8a0fdb..621ef4599416 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -763,7 +763,7 @@ PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) PTREGSCALL3(execve) -PTREGSCALL0(sigaltstack) +PTREGSCALL2(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) PTREGSCALL0(vm86) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 74fe6d86dc5d..4fd173cd8e57 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -545,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_X86_32 -int sys_sigaltstack(struct pt_regs *regs) -{ - const stack_t __user *uss = (const stack_t __user *)regs->bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long +long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { return do_sigaltstack(uss, uoss, regs->sp); } -#endif /* CONFIG_X86_32 */ /* * Do a signal return; undo the signal stack. -- cgit v1.2.2 From f1382f157fb1175bba008abad0907310a1e459ce Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:55 -0500 Subject: x86, 32-bit: Convert sys_vm86 & sys_vm86old Convert these to new PTREGSCALL stubs. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/syscalls.h | 4 ++-- arch/x86/kernel/entry_32.S | 4 ++-- arch/x86/kernel/vm86_32.c | 11 +++++------ 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 94e0b61fb040..df2c51106565 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -66,8 +66,8 @@ asmlinkage int sys_uname(struct old_utsname __user *); asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ -int sys_vm86old(struct pt_regs *); -int sys_vm86(struct pt_regs *); +int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); +int sys_vm86(unsigned long, unsigned long, struct pt_regs *); #else /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 621ef4599416..6c2f25d9b9d5 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -766,8 +766,8 @@ PTREGSCALL3(execve) PTREGSCALL2(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) -PTREGSCALL0(vm86) -PTREGSCALL0(vm86old) +PTREGSCALL2(vm86) +PTREGSCALL1(vm86old) .macro FIXUP_ESPFIX_STACK /* diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9c4e62539058..5ffb5622f793 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -197,9 +197,8 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -int sys_vm86old(struct pt_regs *regs) +int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -227,7 +226,7 @@ out: } -int sys_vm86(struct pt_regs *regs) +int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs->bx) { + switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); + ret = do_vm86_irq_handling(cmd, (int)arg); goto out; case VM86_PLUS_INSTALL_CHECK: /* @@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs) ret = -EPERM; if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)regs->cx; + v86 = (struct vm86plus_struct __user *)arg; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); -- cgit v1.2.2 From f839bbc5c81b1c92ff8e81c360e9564f7b961b2e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:56 -0500 Subject: x86: Merge sys_clone Change 32-bit sys_clone to new PTREGSCALL stub, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-7-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/syscalls.h | 8 ++------ arch/x86/kernel/entry_32.S | 14 +++++++++++++- arch/x86/kernel/process.c | 9 +++++++++ arch/x86/kernel/process_32.c | 15 --------------- arch/x86/kernel/process_64.c | 9 --------- 5 files changed, 24 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index df2c51106565..b0ce78061708 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -25,6 +25,8 @@ int sys_fork(struct pt_regs *); int sys_vfork(struct pt_regs *); long sys_execve(char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); +long sys_clone(unsigned long, unsigned long, void __user *, + void __user *, struct pt_regs *); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); @@ -42,9 +44,6 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 -/* kernel/process_32.c */ -int sys_clone(struct pt_regs *); - /* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, @@ -73,9 +72,6 @@ int sys_vm86(unsigned long, unsigned long, struct pt_regs *); /* X86_64 only */ /* kernel/process_64.c */ -asmlinkage long sys_clone(unsigned long, unsigned long, - void __user *, void __user *, - struct pt_regs *); long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6c2f25d9b9d5..6492555d123d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -760,7 +760,6 @@ ptregs_##name: \ PTREGSCALL1(iopl) PTREGSCALL0(fork) -PTREGSCALL0(clone) PTREGSCALL0(vfork) PTREGSCALL3(execve) PTREGSCALL2(sigaltstack) @@ -769,6 +768,19 @@ PTREGSCALL0(rt_sigreturn) PTREGSCALL2(vm86) PTREGSCALL1(vm86old) +/* Clone is an oddball. The 4th arg is in %edi */ + ALIGN; +ptregs_clone: + leal 4(%esp),%eax + pushl %eax + pushl PT_EDI(%eax) + movl PT_EDX(%eax),%ecx + movl PT_ECX(%eax),%edx + movl PT_EBX(%eax),%eax + call sys_clone + addl $8,%esp + ret + .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bb17bd9334fb..f3c1a6b3a65e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -234,6 +234,15 @@ int sys_vfork(struct pt_regs *regs) NULL, NULL); } +long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + /* * sys_execve() executes a new program. 
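A note for readers following the conversion: each PTREGSCALLn stub recovers the first n syscall arguments from the pt_regs frame saved at kernel entry (i386 passes them in %ebx, %ecx, %edx, %edi in that order) and hands the C handler an explicit struct pt_regs pointer as its final argument. As a rough illustration only (ptregs_clone_model is an invented name, not a kernel symbol), the hand-written ptregs_clone stub above behaves like this C function:

/*
 * Illustrative sketch of what the ptregs_clone assembly stub does:
 * forward the four syscall arguments from the saved register frame
 * and append the frame pointer itself. Not kernel source.
 */
long ptregs_clone_model(struct pt_regs *regs)
{
	return sys_clone(regs->bx,			/* clone_flags */
			 regs->cx,			/* newsp */
			 (void __user *)regs->dx,	/* parent_tid */
			 (void __user *)regs->di,	/* child_tid */
			 regs);
}

clone needs the open-coded stub because its fourth argument lives in %edi, which the PTREGSCALL1/2/3 macros never load.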
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 486e38e2900b..506d5a7ba17c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -436,21 +436,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -int sys_clone(struct pt_regs *regs) -{ - unsigned long clone_flags; - unsigned long newsp; - int __user *parent_tidptr, *child_tidptr; - - clone_flags = regs->bx; - newsp = regs->cx; - parent_tidptr = (int __user *)regs->dx; - child_tidptr = (int __user *)regs->di; - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); -} - #define top_esp (THREAD_SIZE - sizeof(unsigned long)) #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 671960d82587..83019f94b83d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -534,15 +534,6 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -asmlinkage long -sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - unsigned long get_wchan(struct task_struct *p) { unsigned long stack; -- cgit v1.2.2 From ce9119ad90b1caba550447bfcc0a21850558ca49 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 9 Dec 2009 16:33:44 -0800 Subject: x86-32: Avoid pipeline serialization in PTREGSCALL1 and 2 In the PTREGSCALL1 and 2 macros, we can trivially avoid an unnecessary pipeline serialization, so do so. In PTREGSCALL3 this is much less clear-cut since we have to push a new value to the stack. Leave it alone for now assuming it is as good as it is going to be; may want to check on Atom or another in-order x86 to see if we can do better. Signed-off-by: H. Peter Anvin Cc: Brian Gerst LKML-Reference: <1260403316-5679-2-git-send-email-brgerst@gmail.com> --- arch/x86/kernel/entry_32.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6492555d123d..cb12b9bfc9cc 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -735,15 +735,15 @@ ptregs_##name: \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%edx; \ - movl PT_EBX(%edx),%eax; \ + movl (PT_EBX+4)(%esp),%eax; \ jmp sys_##name; #define PTREGSCALL2(name) \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%ecx; \ - movl PT_ECX(%ecx),%edx; \ - movl PT_EBX(%ecx),%eax; \ + movl (PT_ECX+4)(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ jmp sys_##name; #define PTREGSCALL3(name) \ -- cgit v1.2.2 From fc380ceed7fe469728ea4acdbda4495ea943ee1c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 9 Dec 2009 16:54:08 -0800 Subject: x86-64, paravirt: Call set_iopl_mask() on 64 bits set_iopl_mask() is a no-op on 64 bits, but it is also a paravirt hook, so call it even on 64 bits. Signed-off-by: H.
Peter Anvin Cc: Jeremy Fitzhardinge Cc: Brian Gerst LKML-Reference: <1260403316-5679-3-git-send-email-brgerst@gmail.com> --- arch/x86/kernel/ioport.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 85ecc7c57ba6..8eec0ec59af2 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -116,10 +116,8 @@ long sys_iopl(unsigned int level, struct pt_regs *regs) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); -#ifdef CONFIG_X86_32 t->iopl = level << 12; set_iopl_mask(t->iopl); -#endif return 0; } -- cgit v1.2.2 From 5cd476effe9570d2e520cf904d51f5c992972647 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 Dec 2009 10:45:33 -0800 Subject: x86: es7000_32.c: Use pr_ and add pr_fmt(fmt) - Added #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Converted a few printk(KERN_INFO to pr_info( - Stripped "es7000_mipcfg" from pr_debug Signed-off-by: Joe Perches LKML-Reference: <3b4375af246dec5941168858910210937c110af9.1260383912.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e85f8fb7f8e7..dd2b5f264643 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -27,6 +27,9 @@ * * http://www.unisys.com */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr) mip_addr = val; mip = (struct mip_reg *)val; mip_reg = __va(mip); - pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + pr_debug("host_reg = 0x%lx\n", (unsigned long)host_reg); - pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + pr_debug("mip_reg = 0x%lx\n", (unsigned long)mip_reg); success++; break; @@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void) if (!es7000_plat) return; - printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); + pr_info("Enabling APIC mode.\n"); memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); es7000_mip_reg.off_0x00 = MIP_SW_APIC; es7000_mip_reg.off_0x38 = MIP_VALID; @@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void) { int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - printk(KERN_INFO - "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? "Physical Cluster" : "Logical Cluster", nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); -- cgit v1.2.2 From 40685236b3161b80030a4df34bcbc5941ea59876 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 Dec 2009 10:45:34 -0800 Subject: x86: setup_percpu.c: Use pr_ and add pr_fmt(fmt) - Added #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Stripped PERCPU: from a pr_warning Signed-off-by: Joe Perches LKML-Reference: <7ead24eccbea8f2b11795abad3e2893a98e1e111.1260383912.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d559af913e1f..35abcb8b00e9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -20,9 +22,9 @@ #include #ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(x...) 
printk(KERN_DEBUG x) +# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__) #else -# define DBG(x...) +# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) #endif DEFINE_PER_CPU(int, cpu_number); @@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } else { ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), size, align, goal); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", + cpu, size, node, __pa(ptr)); } return ptr; #else @@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", + pr_warning("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) -- cgit v1.2.2 From a78d9626f4f6fa7904bfdb071205080743125983 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 Dec 2009 10:45:35 -0800 Subject: x86: i8254.c: Add pr_fmt(fmt) - Add pr_fmt(fmt) "pit: " fmt - Strip pit: prefixes from pr_debug Signed-off-by: Joe Perches LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kvm/i8254.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index fab7440c9bb2..296aba49472a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -29,6 +29,8 @@ * Based on QEMU and Xen. */ +#define pr_fmt(fmt) "pit: " fmt + #include #include "irq.h" @@ -262,7 +264,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) static void destroy_pit_timer(struct kvm_timer *pt) { - pr_debug("pit: execute del timer!\n"); + pr_debug("execute del timer!\n"); hrtimer_cancel(&pt->timer); } @@ -284,7 +286,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); - pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); + pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); @@ -309,7 +311,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) WARN_ON(!mutex_is_locked(&ps->lock)); - pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); + pr_debug("load_count val is %d, channel is %d\n", val, channel); /* * The largest possible initial count is 0; this is equivalent @@ -395,8 +397,8 @@ static int pit_ioport_write(struct kvm_io_device *this, mutex_lock(&pit_state->lock); if (val != 0) - pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", - (unsigned int)addr, len, val); + pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n", + (unsigned int)addr, len, val); if (addr == 3) { channel = val >> 6; -- cgit v1.2.2 From 1bd591a5f17f546121fcf0015d72cc3e9c49cc29 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 Dec 2009 10:45:36 -0800 Subject: x86: kmmio.c: Add and use pr_fmt(fmt) - Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Strip "kmmio: " from pr_s Signed-off-by: Joe Perches LKML-Reference: <7aa509f8a23933036d39f54bd51e9acc52068049.1260383912.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 
88612cdcdc3b..68c3e89af5c2 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -5,6 +5,8 @@ * 2008 Pekka Paalanen */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -136,7 +138,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) pte_t *pte = lookup_address(f->page, &level); if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", f->page); + pr_err("no pte for page 0x%08lx\n", f->page); return -1; } @@ -148,7 +150,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) clear_pte_presence(pte, clear, &f->old_presence); break; default: - pr_err("kmmio: unexpected page level 0x%x.\n", level); + pr_err("unexpected page level 0x%x.\n", level); return -1; } @@ -170,13 +172,14 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) static int arm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret; - WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); + WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", + f->page, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), + f->page); f->armed = true; return ret; } @@ -240,24 +243,21 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * condition needs handling by do_page_fault(), the * page really not being present is the most common. */ - pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", - addr, smp_processor_id()); + pr_debug("secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); if (!faultpage->old_presence) - pr_info("kmmio: unexpected secondary hit for " - "address 0x%08lx on CPU %d.\n", addr, - smp_processor_id()); + pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n", + addr, smp_processor_id()); } else { /* * Prevent overwriting already in-flight context. * This should not happen, let's hope disarming at * least prevents a panic. */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " - "for address 0x%08lx. Ignoring.\n", - smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); + pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr); disarm_kmmio_fault_page(faultpage); } goto no_kmmio_ctx; @@ -316,8 +316,8 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * something external causing them (f.e. 
using a debugger while * mmio tracing enabled), or erroneous behaviour */ - pr_warning("kmmio: unexpected debug trap on CPU %d.\n", - smp_processor_id()); + pr_warning("unexpected debug trap on CPU %d.\n", + smp_processor_id()); goto out; } @@ -425,7 +425,7 @@ int register_kmmio_probe(struct kmmio_probe *p) list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) - pr_err("kmmio: Unable to set page fault.\n"); + pr_err("Unable to set page fault.\n"); size += PAGE_SIZE; } out: @@ -511,7 +511,7 @@ void unregister_kmmio_probe(struct kmmio_probe *p) drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { - pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + pr_crit("leaking kmmio_fault_page objects.\n"); return; } drelease->release_list = release_list; -- cgit v1.2.2 From 3a0340be06a9356eb61f6804107480acbe62c069 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 Dec 2009 10:45:37 -0800 Subject: x86: mmio-mod.c: Use pr_fmt - Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Remove #define NAME - Remove NAME from pr_ Signed-off-by: Joe Perches LKML-Reference: <009cb214c45ef932df0242856228f4739cc91408.1260383912.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 71 +++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 132772a8ec57..4c765e9c4664 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -19,6 +19,9 @@ * * Derived from the read-mod example from relay-examples by Tom Zanussi. */ + +#define pr_fmt(fmt) "mmiotrace: " + #define DEBUG 1 #include @@ -36,8 +39,6 @@ #include "pf_in.h" -#define NAME "mmiotrace: " - struct trap_reason { unsigned long addr; unsigned long ip; @@ -96,17 +97,18 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", - __func__, address); + pr_err("Error in %s: no pte for page 0x%08lx\n", + __func__, address); return; } if (level == PG_LEVEL_2M) { - pr_emerg(NAME "4MB pages are not currently supported: " - "0x%08lx\n", address); + pr_emerg("4MB pages are not currently supported: 0x%08lx\n", + address); BUG(); } - pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, + pr_info("pte for 0x%lx: 0x%llx 0x%llx\n", + address, (unsigned long long)pte_val(*pte), (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); } @@ -118,22 +120,21 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(NAME "unexpected fault for address: 0x%08lx, " - "last fault for address: 0x%08lx\n", - addr, my_reason->addr); + pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n", + addr, my_reason->addr); print_pte(addr); print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); #ifdef __i386__ pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); + regs->ax, regs->bx, regs->cx, regs->dx); pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); + regs->si, regs->di, regs->bp, regs->sp); #else pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", - regs->ax, regs->cx, regs->dx); + regs->ax, regs->cx, regs->dx); 
pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", - regs->si, regs->di, regs->bp, regs->sp); + regs->si, regs->di, regs->bp, regs->sp); #endif put_cpu_var(pf_reason); BUG(); @@ -213,7 +214,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - pr_emerg(NAME "unexpected post handler"); + pr_emerg("unexpected post handler"); BUG(); } @@ -244,7 +245,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, }; if (!trace) { - pr_err(NAME "kmalloc failed in ioremap\n"); + pr_err("kmalloc failed in ioremap\n"); return; } @@ -282,8 +283,8 @@ void mmiotrace_ioremap(resource_size_t offset, unsigned long size, if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", - (unsigned long long)offset, size, addr); + pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); if ((filter_offset) && (offset != filter_offset)) return; ioremap_trace_core(offset, size, addr); @@ -301,7 +302,7 @@ static void iounmap_trace_core(volatile void __iomem *addr) struct remap_trace *tmp; struct remap_trace *found_trace = NULL; - pr_debug(NAME "Unmapping %p.\n", addr); + pr_debug("Unmapping %p.\n", addr); spin_lock_irq(&trace_lock); if (!is_enabled()) @@ -363,9 +364,8 @@ static void clear_trace_list(void) * Caller also ensures is_enabled() cannot change. */ list_for_each_entry(trace, &trace_list, list) { - pr_notice(NAME "purging non-iounmapped " - "trace @0x%08lx, size 0x%lx.\n", - trace->probe.addr, trace->probe.len); + pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); if (!nommiotrace) unregister_kmmio_probe(&trace->probe); } @@ -387,7 +387,7 @@ static void enter_uniprocessor(void) if (downed_cpus == NULL && !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { - pr_notice(NAME "Failed to allocate mask\n"); + pr_notice("Failed to allocate mask\n"); goto out; } @@ -395,20 +395,19 @@ static void enter_uniprocessor(void) cpumask_copy(downed_cpus, cpu_online_mask); cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); if (num_online_cpus() > 1) - pr_notice(NAME "Disabling non-boot CPUs...\n"); + pr_notice("Disabling non-boot CPUs...\n"); put_online_cpus(); for_each_cpu(cpu, downed_cpus) { err = cpu_down(cpu); if (!err) - pr_info(NAME "CPU%d is down.\n", cpu); + pr_info("CPU%d is down.\n", cpu); else - pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); + pr_err("Error taking CPU%d down: %d\n", cpu, err); } out: if (num_online_cpus() > 1) - pr_warning(NAME "multiple CPUs still online, " - "may miss events.\n"); + pr_warning("multiple CPUs still online, may miss events.\n"); } /* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, @@ -420,13 +419,13 @@ static void __ref leave_uniprocessor(void) if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) return; - pr_notice(NAME "Re-enabling CPUs...\n"); + pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { err = cpu_up(cpu); if (!err) - pr_info(NAME "enabled CPU%d.\n", cpu); + pr_info("enabled CPU%d.\n", cpu); else - pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); + pr_err("cannot re-enable CPU%d: %d\n", cpu, err); } } @@ -434,8 +433,8 @@ static void __ref leave_uniprocessor(void) static void enter_uniprocessor(void) { if (num_online_cpus() > 1) - pr_warning(NAME "multiple CPUs are online, may miss events. 
" - "Suggest booting with maxcpus=1 kernel argument.\n"); + pr_warning("multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); } static void leave_uniprocessor(void) @@ -450,13 +449,13 @@ void enable_mmiotrace(void) goto out; if (nommiotrace) - pr_info(NAME "MMIO tracing disabled.\n"); + pr_info("MMIO tracing disabled.\n"); kmmio_init(); enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); spin_unlock_irq(&trace_lock); - pr_info(NAME "enabled.\n"); + pr_info("enabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } @@ -475,7 +474,7 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); kmmio_cleanup(); - pr_info(NAME "disabled.\n"); + pr_info("disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } -- cgit v1.2.2 From b7cc9554bc73641c9ed4d7eb74b2d6e78f20abea Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Dec 2009 11:03:39 +0100 Subject: x86/amd-iommu: Fix passthrough mode The data structure changes to use dev->archdata.iommu field broke the iommu=pt mode because in this case the dev->archdata.iommu was left uninitialized. This moves the inititalization of the devices into the main init function and fixes the problem. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 3 ++- arch/x86/kernel/amd_iommu.c | 39 ++++++++++++++++++++++++++++++++-- arch/x86/kernel/amd_iommu_init.c | 7 ++++++ 3 files changed, 46 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 84786fb9a23b..2566e2606224 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -28,7 +28,8 @@ extern void amd_iommu_flush_all_domains(void); extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_apply_erratum_63(u16 devid); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); - +extern int amd_iommu_init_devices(void); +extern void amd_iommu_uninit_devices(void); #ifndef CONFIG_AMD_IOMMU_STATS static inline void amd_iommu_stats_init(void) { } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 32fb09102a13..450dd6ac03d3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -166,6 +166,43 @@ static void iommu_uninit_device(struct device *dev) { kfree(dev->archdata.iommu); } + +void __init amd_iommu_uninit_devices(void) +{ + struct pci_dev *pdev = NULL; + + for_each_pci_dev(pdev) { + + if (!check_device(&pdev->dev)) + continue; + + iommu_uninit_device(&pdev->dev); + } +} + +int __init amd_iommu_init_devices(void) +{ + struct pci_dev *pdev = NULL; + int ret = 0; + + for_each_pci_dev(pdev) { + + if (!check_device(&pdev->dev)) + continue; + + ret = iommu_init_device(&pdev->dev); + if (ret) + goto out_free; + } + + return 0; + +out_free: + + amd_iommu_uninit_devices(); + + return ret; +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -2145,8 +2182,6 @@ static void prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; - iommu_init_device(&dev->dev); - /* Is there already any domain for it? 
*/ if (domain_for_device(&dev->dev)) continue; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 7ffc39965233..df01c691d130 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1274,6 +1274,10 @@ static int __init amd_iommu_init(void) if (ret) goto free; + ret = amd_iommu_init_devices(); + if (ret) + goto free; + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else @@ -1296,6 +1300,9 @@ out: return ret; free: + + amd_iommu_uninit_devices(); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); -- cgit v1.2.2 From 8638c4914f34fedc1c13b1cc13f6d1e5a78c46b4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Dec 2009 11:12:25 +0100 Subject: x86/amd-iommu: Fix PCI hotplug with passthrough mode The device change notifier is initialized in the dma_ops initialization path. But this path is never executed for iommu=pt. Move the notifier initialization to IOMMU hardware init code to fix this. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 1 + arch/x86/kernel/amd_iommu.c | 7 +++++-- arch/x86/kernel/amd_iommu_init.c | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 2566e2606224..4d817f9e6e77 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -30,6 +30,7 @@ extern void amd_iommu_apply_erratum_63(u16 devid); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); extern int amd_iommu_init_devices(void); extern void amd_iommu_uninit_devices(void); +extern void amd_iommu_init_notifier(void); #ifndef CONFIG_AMD_IOMMU_STATS static inline void amd_iommu_stats_init(void) { } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 450dd6ac03d3..a83185080e91 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1624,6 +1624,11 @@ static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; +void amd_iommu_init_notifier(void) +{ + bus_register_notifier(&pci_bus_type, &device_nb); +} + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -2250,8 +2255,6 @@ int __init amd_iommu_init_dma_ops(void) register_iommu(&amd_iommu_ops); - bus_register_notifier(&pci_bus_type, &device_nb); - amd_iommu_stats_init(); return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index df01c691d130..309a52f96e0b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1285,6 +1285,8 @@ static int __init amd_iommu_init(void) if (ret) goto free; + amd_iommu_init_notifier(); + enable_iommus(); if (iommu_pass_through) -- cgit v1.2.2 From 6b2f3d1f769be5779b479c37800229d9a4809fc3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Oct 2009 11:05:28 +0100 Subject: vfs: Implement proper O_SYNC semantics While Linux provided an O_SYNC flag basically since day 1, it took until Linux 2.4.0-test12pre2 to actually get it implemented for filesystems, since that day we had generic_osync_around with only minor changes and the great "For now, when the user asks for O_SYNC, we'll actually give O_DSYNC" comment. This patch intends to actually give us real O_SYNC semantics in addition to the O_DSYNC semantics. 
After Jan's O_SYNC patches which are required before this patch it's actually surprisingly simple, we just need to figure out when to set the datasync flag to vfs_fsync_range and when not. This patch renames the existing O_SYNC flag to O_DSYNC while keeping its numerical value to keep binary compatibility, and adds a new real O_SYNC flag. To guarantee backwards compatibility it is defined as expanding to both the O_DSYNC and the new additional binary flag (__O_SYNC) to make sure we are backwards-compatible when compiled against the new headers. This also means that all places that don't care about the differences can just check O_DSYNC and get the right behaviour for O_SYNC, too - only places that actually care need to check __O_SYNC in addition. Drivers and network filesystems have been updated in a fail-safe way to always do the full sync magic if O_DSYNC is set. The few places setting O_SYNC for lower layers are kept that way for now to stay fail-safe. We enforce that O_DSYNC is set when __O_SYNC is set early in the open path to make sure we always get these sane options. Note that parisc really screwed up their headers as they already define an O_DSYNC that has always been a no-op. We try to repair it by using it for the new O_DSYNC and redefining O_SYNC to send both the traditional O_SYNC numerical value _and_ the O_DSYNC one. Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Grant Grundler Cc: "David S. Miller" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Al Viro Cc: Andreas Dilger Acked-by: Trond Myklebust Acked-by: Kyle McMartin Acked-by: Ulrich Drepper Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Jan Kara --- arch/x86/mm/pat.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 66b55d6e69ed..ae9648eb1c7f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -704,9 +704,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (!range_is_allowed(pfn, size)) return 0; - if (file->f_flags & O_SYNC) { + if (file->f_flags & O_DSYNC) flags = _PAGE_CACHE_UC_MINUS; - } #ifdef CONFIG_X86_32 /* -- cgit v1.2.2 From 5e855db5d8fec44e6604eb245aa9077bbd3f0d05 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 10 Dec 2009 17:08:54 +0800 Subject: perf_event: Fix variable initialization in other codepaths Signed-off-by: Xiao Guangrong Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B20BAA6.7010609@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d35f26076ae5..1342f236e32a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1632,6 +1632,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) data.period = event->hw.last_period; data.addr = 0; + data.raw = NULL; regs.ip = 0; /* @@ -1749,6 +1750,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1794,6 +1796,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 ack, status; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1857,6 +1860,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); -- cgit v1.2.2 From
125580380f418000b1a06d9a54700f1191b6e561 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 10 Dec 2009 19:56:34 +0300 Subject: x86, perf events: Check if we have APIC enabled Ralf Hildebrandt reported this boot warning: | Running a vanilla 2.6.32 as Xen DomU, I'm getting: | | [ 0.000999] CPU: Physical Processor ID: 0 | [ 0.000999] CPU: Processor Core ID: 1 | [ 0.000999] Performance Events: AMD PMU driver. | [ 0.000999] ------------[ cut here ]------------ | [ 0.000999] WARNING: at arch/x86/kernel/apic/apic.c:249 native_apic_write_dummy So we need to check if APIC functionality is available, and not just in the P6 driver but elsewhere as well. Reported-by: Ralf Hildebrandt Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091210165634.GF5086@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1342f236e32a..18f05eccbb62 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2066,12 +2066,6 @@ static __init int p6_pmu_init(void) x86_pmu = p6_pmu; - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } - return 0; } @@ -2163,6 +2157,16 @@ static __init int amd_pmu_init(void) return 0; } +static void __init pmu_check_apic(void) +{ + if (cpu_has_apic) + return; + + x86_pmu.apic = 0; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); +} + void __init init_hw_perf_events(void) { int err; @@ -2184,6 +2188,8 @@ void __init init_hw_perf_events(void) return; } + pmu_check_apic(); + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { -- cgit v1.2.2 From 3bd95dfb182969dc6d2a317c150e0df7107608d3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:40 -0500 Subject: x86, 64-bit: Move kernel_thread to C Prepare for merging with 32-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 49 +++------------------------------------- arch/x86/kernel/process_64.c | 31 +++++++++++++++++++++++-- arch/x86/kernel/x8664_ksyms_64.c | 2 -- 3 files changed, 32 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 63bca794c8f9..73d9b2c0e217 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1166,63 +1166,20 @@ bad_gs: jmp 2b .previous -/* - * Create a kernel thread. 
- * - * C extern interface: - * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) - * - * asm input arguments: - * rdi: fn, rsi: arg, rdx: flags - */ -ENTRY(kernel_thread) - CFI_STARTPROC - FAKE_STACK_FRAME $child_rip - SAVE_ALL - - # rdi: flags, rsi: usp, rdx: will be &pt_regs - movq %rdx,%rdi - orq kernel_thread_flags(%rip),%rdi - movq $-1, %rsi - movq %rsp, %rdx - - xorl %r8d,%r8d - xorl %r9d,%r9d - - # clone now - call do_fork - movq %rax,RAX(%rsp) - xorl %edi,%edi - - /* - * It isn't worth to check for reschedule here, - * so internally to the x86_64 port you can rely on kernel_thread() - * not to reschedule the child before returning, this avoids the need - * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] - */ - RESTORE_ALL - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC -END(kernel_thread) - -ENTRY(child_rip) +ENTRY(kernel_thread_helper) pushq $0 # fake return address CFI_STARTPROC /* * Here we are in the child and the registers are set as they were * at kernel_thread() invocation in the parent. */ - movq %rdi, %rax - movq %rsi, %rdi - call *%rax + call *%rsi # exit mov %eax, %edi call do_exit ud2 # padding for call trace CFI_ENDPROC -END(child_rip) +END(kernel_thread_helper) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 83019f94b83d..92484c2130c6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -59,8 +59,6 @@ asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) @@ -231,6 +229,35 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(&regs, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS; + regs.flags = X86_EFLAGS_IF; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, ~0UL, &regs, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a1029769b6f2..9fafaf83b3b8 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -17,8 +17,6 @@ EXPORT_SYMBOL(mcount); #endif -EXPORT_SYMBOL(kernel_thread); - EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); -- cgit v1.2.2 From fa4b8f84383ae197e643a46c36bf58ab8dffc95c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:41 -0500 Subject: x86, 64-bit: Use user_mode() to determine new stack pointer in copy_thread() Use user_mode() instead of a magic value for sp to determine when returning to kernel mode. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-3-git-send-email-brgerst@gmail.com> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/process_64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 92484c2130c6..00ac66fa5c6b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -254,7 +254,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.flags = X86_EFLAGS_IF; /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, ~0UL, &regs, 0, NULL, NULL); + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); } EXPORT_SYMBOL(kernel_thread); @@ -312,8 +312,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, *childregs = *regs; childregs->ax = 0; - childregs->sp = sp; - if (sp == ~0UL) + if (user_mode(regs)) + childregs->sp = sp; + else childregs->sp = (unsigned long)childregs; p->thread.sp = (unsigned long) childregs; -- cgit v1.2.2 From e840227c141116171c89ab1abb5cc9fee6fdb488 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:42 -0500 Subject: x86, 32-bit: Use same regs as 64-bit for kernel_thread_helper The arg should be in %eax, but that is clobbered by the return value of clone. The function pointer can be in any register. Also, don't push args onto the stack, since regparm(3) is the normal calling convention now. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 8 ++------ arch/x86/kernel/process_32.c | 8 ++++---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cb12b9bfc9cc..44a8e0dc6737 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1047,12 +1047,8 @@ END(spurious_interrupt_bug) ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder CFI_STARTPROC - movl %edx,%eax - push %edx - CFI_ADJUST_CFA_OFFSET 4 - call *%ebx - push %eax - CFI_ADJUST_CFA_OFFSET 4 + movl %edi,%eax + call *%esi call do_exit ud2 # padding for call trace CFI_ENDPROC diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 506d5a7ba17c..bd874d2b6ab1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -193,8 +193,8 @@ void show_regs(struct pt_regs *regs) } /* - * This gets run with %bx containing the - * function to call, and %dx containing + * This gets run with %si containing the + * function to call, and %di containing * the "args". */ extern void kernel_thread_helper(void); @@ -208,8 +208,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) memset(&regs, 0, sizeof(regs)); - regs.bx = (unsigned long) fn; - regs.dx = (unsigned long) arg; + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; regs.ds = __USER_DS; regs.es = __USER_DS; -- cgit v1.2.2 From f443ff4201dd25cd4dec183f9919ecba90c8edc2 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:43 -0500 Subject: x86: Sync 32/64-bit kernel_thread Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-5-git-send-email-brgerst@gmail.com> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/process_32.c | 5 ++++- arch/x86/kernel/process_64.c | 11 +++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd874d2b6ab1..f2e8b05a4f02 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -211,14 +211,17 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.si = (unsigned long) fn; regs.di = (unsigned long) arg; +#ifdef CONFIG_X86_32 regs.ds = __USER_DS; regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; regs.gs = __KERNEL_STACK_CANARY; +#endif + regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 00ac66fa5c6b..d49a9094f6f3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -248,10 +248,17 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.si = (unsigned long) fn; regs.di = (unsigned long) arg; +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#endif + regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS; - regs.flags = X86_EFLAGS_IF; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); -- cgit v1.2.2 From df59e7bf439918f523ac29e996ec1eebbed60440 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:44 -0500 Subject: x86: Merge kernel_thread() Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 36 ------------------------------------ arch/x86/kernel/process_64.c | 36 ------------------------------------ 3 files changed, 35 insertions(+), 72 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f3c1a6b3a65e..8705ccedd447 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -243,6 +243,41 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(&regs, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#endif + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | 0x2; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); /* * sys_execve() executes a new program.
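For orientation between the hunks: after this merge both bit widths share one kernel_thread() that fakes a pt_regs frame whose ip points at kernel_thread_helper, so the child comes to life inside the helper, which calls fn(arg) and then do_exit(). A hedged usage sketch (my_worker and the flag choice are invented for illustration, not part of the patch):

/*
 * Hypothetical caller of the merged kernel_thread(). The return value
 * comes straight from do_fork(): the new pid, or a negative errno.
 */
static int my_worker(void *arg)
{
	pr_info("worker thread running, arg=%p\n", arg);
	return 0;
}

static void start_worker(void)
{
	int pid = kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES);

	if (pid < 0)
		pr_err("kernel_thread() failed: %d\n", pid);
}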
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f2e8b05a4f02..ccf234266a2e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -192,42 +192,6 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, &regs->sp, regs->bp); } -/* - * This gets run with %si containing the - * function to call, and %di containing - * the "args". - */ -extern void kernel_thread_helper(void); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(&regs, 0, sizeof(regs)); - - regs.si = (unsigned long) fn; - regs.di = (unsigned long) arg; - -#ifdef CONFIG_X86_32 - regs.ds = __USER_DS; - regs.es = __USER_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; -#endif - - regs.orig_ax = -1; - regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d49a9094f6f3..1a362c5bec37 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -229,42 +229,6 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } -/* - * This gets run with %si containing the - * function to call, and %di containing - * the "args". - */ -extern void kernel_thread_helper(void); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(&regs, 0, sizeof(regs)); - - regs.si = (unsigned long) fn; - regs.di = (unsigned long) arg; - -#ifdef CONFIG_X86_32 - regs.ds = __USER_DS; - regs.es = __USER_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; -#endif - - regs.orig_ax = -1; - regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { -- cgit v1.2.2 From ebb682f522411abbe358059a256a8672ec0bd55b Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 9 Dec 2009 13:36:45 -0500 Subject: x86, AMD: Fix stale cpuid4_info shared_map data in shared_cpu_map cpumasks The per_cpu cpuid4_info shared_map can contain stale data when CPUs are added and removed. The stale data can lead to a NULL pointer dereference panic on a remove of a CPU that has had siblings previously removed. This patch resolves the panic by verifying a cpu is actually online before adding it to the shared_cpu_map, only examining cpus that are part of the same lower level cache, and by updating other siblings' lowest level cache maps when a cpu is added. Signed-off-by: Prarit Bhargava LKML-Reference: <20091209183336.17855.98708.sendpatchset@prarit.bos.redhat.com> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6c40f6b5b340..63ada177b40c 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -507,18 +507,19 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; unsigned long num_threads_sharing; - int index_msb, i; + int index_msb, i, sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - struct cpuinfo_x86 *d; - for_each_online_cpu(i) { + for_each_cpu(i, c->llc_shared_map) { if (!per_cpu(cpuid4_info, i)) continue; - d = &cpu_data(i); this_leaf = CPUID4_INFO_IDX(i, index); - cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), - d->llc_shared_map); + for_each_cpu(sibling, c->llc_shared_map) { + if (!cpu_online(sibling)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } } return; } -- cgit v1.2.2 From 893f38d144a4d96d2483cd7c3801d26e1b2c23e9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 10 Dec 2009 13:07:22 -0800 Subject: x86: Use find_e820() instead of hard coded trampoline address Jens found the following crash/regression: [ 0.000000] found SMP MP-table at [ffff8800000fdd80] fdd80 [ 0.000000] Kernel panic - not syncing: Overlapping early reservations 12-f011 MP-table mpc to 0-fff BIOS data page and [ 0.000000] Kernel panic - not syncing: Overlapping early reservations 12-f011 MP-table mpc to 6000-7fff TRAMPOLINE and bisected it to b24c2a9 ("x86: Move find_smp_config() earlier and avoid bootmem usage"). It turns out the BIOS is using the first 64k for the mptable without reserving it. So try to find a good range for the real-mode trampoline instead of hardcoding it, in case some BIOS tries to use that range for something else.
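The allocation strategy generalizes: walk the firmware-provided memory map and take the first free slot that satisfies the size, alignment and address window. A simplified sketch of such a first-fit search over a toy map; the real find_e820_area() additionally skips ranges claimed by earlier reserve_early() calls, and the types below are illustrative only:

struct free_range { unsigned long start, end; };	/* free RAM, exclusive end */

static unsigned long first_fit(const struct free_range *map, int n,
			       unsigned long lo, unsigned long hi,
			       unsigned long size, unsigned long align)
{
	int i;

	for (i = 0; i < n; i++) {
		unsigned long addr = map[i].start > lo ? map[i].start : lo;

		/* round up; align is assumed to be a power of two */
		addr = (addr + align - 1) & ~(align - 1);
		if (addr + size <= map[i].end && addr + size <= hi)
			return addr;
	}
	return -1UL;	/* same failure value the trampoline caller checks for */
}

The trampoline call site then amounts to first_fit(map, n, 0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE): any page-aligned slot below 1MB works, since the AP starts executing in real mode.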
Reported-by: Jens Axboe Signed-off-by: Yinghai Lu Tested-by: Jens Axboe Cc: Randy Dunlap LKML-Reference: <4B21630A.6000308@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trampoline.h | 1 - arch/x86/kernel/e820.c | 11 ++++++++++- arch/x86/kernel/head32.c | 2 -- arch/x86/kernel/head64.c | 2 -- arch/x86/kernel/mpparse.c | 3 --- arch/x86/kernel/setup.c | 13 ++++++++----- arch/x86/kernel/trampoline.c | 20 +++++++++----------- 7 files changed, 27 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index 90f06c25221d..cb507bb05d79 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -16,7 +16,6 @@ extern unsigned long initial_code; extern unsigned long initial_gs; #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) -#define TRAMPOLINE_BASE 0x6000 extern unsigned long setup_trampoline(void); extern void __init reserve_trampoline_memory(void); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d17d482a04f4..f50447d961c0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -732,7 +732,16 @@ struct early_res { char overlap_ok; }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ + { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ +#ifdef CONFIG_X86_32 + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 }, +#endif + {} }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 4f8e2507e8f3..5051b94c9069 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,8 +29,6 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 0b06cd778fd9..b5a9896ca1e7 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 35a57c963df9..40b54ceb68b5 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -945,9 +945,6 @@ void __init early_reserve_e820_mpc_new(void) { if (enable_update_mptable && alloc_mptable) { u64 startt = 0; -#ifdef CONFIG_X86_TRAMPOLINE - startt = TRAMPOLINE_BASE; -#endif mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 946a311a25c9..f7b8b9894b22 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -73,6 +73,7 @@ #include #include +#include <asm/trampoline.h> #include #include #include @@ -875,6 +876,13 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + + reserve_trampoline_memory(); + #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support.
@@ -921,11 +929,6 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); - #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index cd022121cab6..c652ef62742d 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -12,21 +12,19 @@ #endif /* ready for x86_64 and x86 */ -unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base; void __init reserve_trampoline_memory(void) { -#ifdef CONFIG_X86_32 - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); -#endif + unsigned long mem; + /* Has to be in very low memory so we can execute real-mode AP code. */ - reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, - "TRAMPOLINE"); + mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + if (mem == -1L) + panic("Cannot allocate trampoline\n"); + + trampoline_base = __va(mem); + reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); } /* -- cgit v1.2.2 From f8b7256096a20436f6d0926747e3ac3d64c81d24 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Nov 2009 17:37:04 -0500 Subject: Unify sys_mmap* New helper - sys_mmap_pgoff(); switch syscalls to using it. Acked-by: David S. Miller Signed-off-by: Al Viro --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/ia32/sys_ia32.c | 43 +------------------------------------- arch/x86/include/asm/sys_ia32.h | 3 --- arch/x86/include/asm/syscalls.h | 2 -- arch/x86/kernel/sys_i386_32.c | 27 +----------------------- arch/x86/kernel/sys_x86_64.c | 17 +-------------- arch/x86/kernel/syscall_table_32.S | 2 +- 7 files changed, 5 insertions(+), 91 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4eefdca9832b..53147ad85b96 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -696,7 +696,7 @@ ia32_sys_call_table: .quad quiet_ni_syscall /* streams2 */ .quad stub32_vfork /* 190 */ .quad compat_sys_getrlimit - .quad sys32_mmap2 + .quad sys_mmap_pgoff .quad sys32_truncate64 .quad sys32_ftruncate64 .quad sys32_stat64 /* 195 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index df82c0e48ded..422572c77923 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -155,9 +155,6 @@ struct mmap_arg_struct { asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) { struct mmap_arg_struct a; - struct file *file = NULL; - unsigned long retval; - struct mm_struct *mm ; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; @@ -165,22 +162,8 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) return -EINVAL; - if (!(a.flags & MAP_ANONYMOUS)) { - file = fget(a.fd); - if (!file) - return -EBADF; - } - - mm = current->mm; - down_write(&mm->mmap_sem); - retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset>>PAGE_SHIFT); - if (file) - fput(file); - - up_write(&mm->mmap_sem); - - return retval; } asmlinkage long sys32_mprotect(unsigned long start, size_t len, @@ -483,30 +466,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_mmap2(unsigned 
long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct mm_struct *mm = current->mm; - unsigned long error; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); - return error; -} - asmlinkage long sys32_olduname(struct oldold_utsname __user *name) { char *arch = "x86_64"; diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 9af9decb38c3..4a5a089e1c62 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -57,9 +57,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); - struct oldold_utsname; struct old_utsname; asmlinkage long sys32_olduname(struct oldold_utsname __user *); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 372b76edd63f..1bb6e395881c 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -55,8 +55,6 @@ struct sel_arg_struct; struct oldold_utsname; struct old_utsname; -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); asmlinkage int old_mmap(struct mmap_arg_struct __user *); asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 1884a8d12bfa..dee1ff7cba58 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,31 +24,6 @@ #include -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - struct mm_struct *mm = current->mm; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. 
Linux/i386 didn't use to be able to handle more than @@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) goto out; - err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return err; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 45e00eb09c3a..8aa2057efd12 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, fd, unsigned long, off) { long error; - struct file *file; - error = -EINVAL; if (off & ~PAGE_MASK) goto out; - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); + error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 70c2125d55b9..15228b5d3eb7 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -191,7 +191,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long ptregs_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ -- cgit v1.2.2 From a5d09d68335bb8422d5e7050c9f03f99ba6cfebd Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Fri, 11 Dec 2009 08:43:12 -0600 Subject: kgdb,x86: remove redundant test The for loop starts with a breakno of 0, and ends when it's 4, so this test is always true. Signed-off-by: Roel Kluin Signed-off-by: Andrew Morton Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 20a5b3689463..f93d015753ce 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -220,8 +220,7 @@ static void kgdb_correct_hw_break(void) dr7 |= ((breakinfo[breakno].len << 2) | breakinfo[breakno].type) << ((breakno << 2) + 16); - if (breakno >= 0 && breakno <= 3) - set_debugreg(breakinfo[breakno].addr, breakno); + set_debugreg(breakinfo[breakno].addr, breakno); } else { if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { -- cgit v1.2.2 From cf6f196d112a6f6757b1ca3cce0b576f7abee479 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:16 -0600 Subject: kgdb,i386: Fix corner case access to ss with NMI watchdog exception It is possible for the user_mode_vm(regs) check to return true on the i386 arch for a non-master kgdb cpu or when the master kgdb cpu handles the NMI watchdog exception. The solution is simply to select the correct gdb_ss location based on the user_mode_vm(regs) check. CC: Ingo Molnar Acked-by: H.
Peter Anvin Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f93d015753ce..aefae46aa646 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -86,9 +86,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; + if (user_mode_vm(regs)) { + gdb_regs[GDB_SS] = regs->ss; + gdb_regs[GDB_SP] = regs->sp; + } else { + gdb_regs[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + } #else gdb_regs[GDB_R8] = regs->r8; gdb_regs[GDB_R9] = regs->r9; @@ -101,8 +107,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs32[GDB_PS] = regs->flags; gdb_regs32[GDB_CS] = regs->cs; gdb_regs32[GDB_SS] = regs->ss; -#endif gdb_regs[GDB_SP] = kernel_stack_pointer(regs); +#endif } /** -- cgit v1.2.2 From 8097551d9ab9b9e3630694ad1bc6e12c597c515e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:18 -0600 Subject: kgdb,x86: do not set kgdb_single_step on x86 On an SMP system the kgdb_single_step flag has the possibility to indefinitely hang the system. Consider the case where CPU 1 holds the schedule lock and CPU 0 is set to single-step: there is no way for CPU 0 to run another task. The easy way to observe the problem is to make 2 cpus busy, and run the kgdb test suite. You will see that it hangs the system very quickly. while [ 1 ] ; do find /proc > /dev/null 2>&1 ; done & while [ 1 ] ; do find /proc > /dev/null 2>&1 ; done & echo V1 > /sys/module/kgdbts/parameters/kgdbts The side effect of this patch is that there is the possibility to miss a breakpoint in the case that a single step operation was executed to step over a breakpoint in common code. The trade-off of the missed breakpoint is preferred to hanging the kernel. This can be fixed in the future by using kprobes or another strategy to step over planted breakpoints with out-of-line execution. CC: Ingo Molnar Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index aefae46aa646..dd74fe7273b1 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -400,7 +400,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, /* set the trace bit if we're stepping */ if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; - kgdb_single_step = 1; atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id()); } -- cgit v1.2.2 From 505422517d3f126bb939439e9d15dece94e11d2c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 11 Dec 2009 18:14:40 +0100 Subject: x86, msr: Add support for non-contiguous cpumasks The current rd/wrmsr_on_cpus helpers assume that the supplied cpumasks are contiguous. However, there are machines out there like some K8 multinode Opterons which have a non-contiguous core enumeration on each node (e.g. cores 0,2 on node 0 instead of 0,1), see http://www.gossamer-threads.com/lists/linux/kernel/1160268. This patch fixes out-of-bounds writes (see URL above) by adding per-CPU msr structs which are used on the respective cores. Additionally, two helpers, msrs_{alloc,free}, are provided for use by the callers of the MSR accessors. Cc: H.
Peter Anvin Cc: Mauro Carvalho Chehab Cc: Aristeu Rozanski Cc: Randy Dunlap Cc: Doug Thompson Signed-off-by: Borislav Petkov LKML-Reference: <20091211171440.GD31998@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 3 +++ arch/x86/lib/msr.c | 26 ++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 5bef931f8b14..2d228fc9b4b7 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -244,6 +244,9 @@ do { \ #define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) +struct msr *msrs_alloc(void); +void msrs_free(struct msr *msrs); + #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 41628b104b9e..872834177937 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -7,7 +7,6 @@ struct msr_info { u32 msr_no; struct msr reg; struct msr *msrs; - int off; int err; }; @@ -18,7 +17,7 @@ static void __rdmsr_on_cpu(void *info) int this_cpu = raw_smp_processor_id(); if (rv->msrs) - reg = &rv->msrs[this_cpu - rv->off]; + reg = per_cpu_ptr(rv->msrs, this_cpu); else reg = &rv->reg; @@ -32,7 +31,7 @@ static void __wrmsr_on_cpu(void *info) int this_cpu = raw_smp_processor_id(); if (rv->msrs) - reg = &rv->msrs[this_cpu - rv->off]; + reg = per_cpu_ptr(rv->msrs, this_cpu); else reg = &rv->reg; @@ -80,7 +79,6 @@ static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, memset(&rv, 0, sizeof(rv)); - rv.off = cpumask_first(mask); rv.msrs = msrs; rv.msr_no = msr_no; @@ -120,6 +118,26 @@ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) } EXPORT_SYMBOL(wrmsr_on_cpus); +struct msr *msrs_alloc(void) +{ + struct msr *msrs = NULL; + + msrs = alloc_percpu(struct msr); + if (!msrs) { + pr_warning("%s: error allocating msrs\n", __func__); + return NULL; + } + + return msrs; +} +EXPORT_SYMBOL(msrs_alloc); + +void msrs_free(struct msr *msrs) +{ + free_percpu(msrs); +} +EXPORT_SYMBOL(msrs_free); + /* These "safe" variants are slower and should be used when the target MSR may not actually exist. */ static void __rdmsr_safe_on_cpu(void *info) -- cgit v1.2.2 From 450b1e8dd10f41b5adad73f48ce8f6707d17c5c4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 11 Dec 2009 08:08:50 -0800 Subject: x86: Remove enabling x2apic message for every CPU Print the message that the system supports x2apic mode only once. Signed-off-by: Mike Travis Acked-by: Cyrill Gorcunov LKML-Reference: <4B226E92.5080904@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index efb2b9cd132c..aa57c079c98f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1341,7 +1341,7 @@ void enable_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { - pr_info("Enabling x2apic\n"); + printk_once(KERN_INFO "Enabling x2apic\n"); wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } -- cgit v1.2.2 From 2eaad1fddd7450a48ad464229775f97fbfe8af36 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 10 Dec 2009 17:19:36 -0800 Subject: x86: Limit the number of processor bootup messages When there are a large number of processors in a system, there is an excessive number of messages sent to the system console.
It's estimated that with 4096 processors in a system, and the console baudrate set to 56K, the startup messages will take about 84 minutes to clear the serial port. This set of patches limits the number of repetitious messages which contain no additional information. Much of this information is obtainable from the /proc and /sysfs. Some of the messages are also sent to the kernel log buffer as KERN_DEBUG messages so dmesg can be used to examine more closely any details specific to a problem. The new cpu bootup sequence for system_state == SYSTEM_BOOTING: Booting Node 0, Processors #1 #2 #3 #4 #5 #6 #7 Ok. Booting Node 1, Processors #8 #9 #10 #11 #12 #13 #14 #15 Ok. ... Booting Node 3, Processors #56 #57 #58 #59 #60 #61 #62 #63 Ok. Brought up 64 CPUs After the system is running, a single line boot message is displayed when CPU's are hotplugged on: Booting Node %d Processor %d APIC 0x%x Status of the following lines: CPU: Physical Processor ID: printed once (for boot cpu) CPU: Processor Core ID: printed once (for boot cpu) CPU: Hyper-Threading is disabled printed once (for boot cpu) CPU: Thermal monitoring enabled printed once (for boot cpu) CPU %d/0x%x -> Node %d: removed CPU %d is now offline: only if system_state == RUNNING Initializing CPU#%d: KERN_DEBUG Signed-off-by: Mike Travis LKML-Reference: <4B219E28.8080601@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 15 ++++++---- arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/cpu/common.c | 8 ++++-- arch/x86/kernel/cpu/intel.c | 2 -- arch/x86/kernel/cpu/mcheck/therm_throt.c | 4 +-- arch/x86/kernel/smpboot.c | 45 ++++++++++++++++++++---------- 6 files changed, 47 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index c965e5212714..468489b57aae 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -74,6 +74,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; + static bool printed; if (c->cpuid_level < 0xb) return; @@ -127,12 +128,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } return; #endif } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7128b3799cec..8dc3ea145c97 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -375,8 +375,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) node = nearby_node(apicid); } numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1afa990a6c8..0ee9a3254eec 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -427,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; int index_msb, core_bits; + static bool printed; if (!cpu_has(c, 
X86_FEATURE_HT)) return; @@ -442,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -469,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) ((1 << core_bits) - 1); out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { + if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printed = 1; } #endif } @@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void) if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) panic("CPU#%d already initialized!\n", cpu); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + pr_debug("Initializing CPU#%d\n", cpu); clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c900b73f9224..9c31e8b09d2c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -270,8 +270,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) node = cpu_to_node(cpu); } numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4fef985fc221..1003ed4bbce4 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -339,8 +339,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); + printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 29e6744f51e3..678d0b8c26f3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -671,6 +671,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* reduce the number of lines printed when booting a large cpu count system */ +static void __cpuinit announce_cpu(int cpu, int apicid) +{ + static int current_node = -1; + int node = cpu_to_node(cpu); + + if (system_state == SYSTEM_BOOTING) { + if (node != current_node) { + if (current_node > (-1)) + pr_cont(" Ok.\n"); + current_node = node; + pr_info("Booting Node %3d, Processors ", node); + } + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + return; + } else + pr_info("Booting Node %d Processor %d APIC 0x%x\n", + node, cpu, apicid); +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -737,9 +757,8 @@ do_rest: /* start_ip had better be page-aligned! 
*/ start_ip = setup_trampoline(); - /* So we see what's up */ - printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", - cpu, apicid, start_ip); + /* So we see what's up */ + announce_cpu(cpu, apicid); /* * This grunge runs the startup process for @@ -788,21 +807,17 @@ do_rest: udelay(100); } - if (cpumask_test_cpu(cpu, cpu_callin_mask)) { - /* number CPUs logically, starting from 1 (BSP is 0) */ - pr_debug("OK.\n"); - printk(KERN_INFO "CPU%d: ", cpu); - print_cpu_info(&cpu_data(cpu)); - pr_debug("CPU has booted.\n"); - } else { + if (cpumask_test_cpu(cpu, cpu_callin_mask)) + pr_debug("CPU%d: has booted.\n", cpu); + else { boot_error = 1; if (*((volatile unsigned char *)trampoline_base) == 0xA5) /* trampoline started but...? */ - printk(KERN_ERR "Stuck ??\n"); + pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - printk(KERN_ERR "Not responding.\n"); + pr_err("CPU%d: Not responding.\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -1293,14 +1308,16 @@ void native_cpu_die(unsigned int cpu) for (i = 0; i < 10; i++) { /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { - printk(KERN_INFO "CPU %d is now offline\n", cpu); + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); + if (1 == num_online_cpus()) alternatives_smp_switch(0); return; } msleep(100); } - printk(KERN_ERR "CPU %u didn't die...\n", cpu); + pr_err("CPU %u didn't die...\n", cpu); } void play_dead_common(void) -- cgit v1.2.2 From 559df2e0210352f83926d178c40c51142292a18c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 19 Apr 2009 22:35:10 +0200 Subject: kbuild: move asm-offsets.h to include/generated The simplest method was to add an extra asm-offsets.h file in arch/$ARCH/include/asm that references the generated file. We can now migrate the architectures one-by-one to reference the generated file directly - and when done we can delete the temporary arch/$ARCH/include/asm/asm-offsets.h file.
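The temporary stub is a single line per architecture; in the generic form described above it amounts to the following (the concrete x86 instance appears in the diff below):

/* arch/$ARCH/include/asm/asm-offsets.h -- forwarding stub until callers migrate */
#include <generated/asm-offsets.h>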
Signed-off-by: Sam Ravnborg Cc: Al Viro Signed-off-by: Michal Marek --- arch/x86/include/asm/asm-offsets.h | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/x86/include/asm/asm-offsets.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm-offsets.h b/arch/x86/include/asm/asm-offsets.h new file mode 100644 index 000000000000..d370ee36a182 --- /dev/null +++ b/arch/x86/include/asm/asm-offsets.h @@ -0,0 +1 @@ +#include <generated/asm-offsets.h> -- cgit v1.2.2 From 92045954058671fdd0ccf031ca06611ce1d929d1 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 18 Oct 2009 00:36:47 +0200 Subject: kbuild: move compile.h to include/generated Signed-off-by: Sam Ravnborg Signed-off-by: Michal Marek --- arch/x86/boot/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c index 2723d9b5ce43..4d88763e39cb 100644 --- a/arch/x86/boot/version.c +++ b/arch/x86/boot/version.c @@ -14,7 +14,7 @@ #include "boot.h" #include <linux/utsrelease.h> -#include <linux/compile.h> +#include <generated/compile.h> const char kernel_version[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") " -- cgit v1.2.2 From 273b281fa22c293963ee3e6eec418f5dda2dbc83 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 18 Oct 2009 00:52:28 +0200 Subject: kbuild: move utsrelease.h to include/generated Fix up all users of utsrelease.h Signed-off-by: Sam Ravnborg Signed-off-by: Michal Marek --- arch/x86/boot/header.S | 2 +- arch/x86/boot/version.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b31cc54b4641..93e689f4bd86 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -16,7 +16,7 @@ */ #include -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include #include #include diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c index 4d88763e39cb..2b15aa488ffb 100644 --- a/arch/x86/boot/version.c +++ b/arch/x86/boot/version.c @@ -13,7 +13,7 @@ */ #include "boot.h" -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include <generated/compile.h> const char kernel_version[] = -- cgit v1.2.2 From 1d865fb728bd6bbcdfbd6ec1e2b8ade3b4805641 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 11 Dec 2009 11:36:18 -0600 Subject: x86: Fix duplicated UV BAU interrupt vector Interrupt vector 0xec has been doubly defined in irq_vectors.h. It seems arbitrary which of LOCAL_PENDING_VECTOR or UV_BAU_MESSAGE gets the higher number, as long as they are unique. If they are not unique we'll hit a BUG in alloc_system_vector(). Signed-off-by: Cliff Wickman Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6a635bd39867..4611f085cd43 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -113,7 +113,7 @@ */ #define LOCAL_PENDING_VECTOR 0xec -#define UV_BAU_MESSAGE 0xec +#define UV_BAU_MESSAGE 0xea /* * Self IPI vector for machine checks -- cgit v1.2.2 From eba11d6da7a983cedb0acf32a38e4d0daa8b5d0e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 13 Dec 2009 23:24:03 -0800 Subject: x86: Fix build warning in arch/x86/mm/mmio-mod.c Stephen Rothwell reported these warnings: arch/x86/mm/mmio-mod.c: In function 'print_pte': arch/x86/mm/mmio-mod.c:100: warning: too many arguments for format arch/x86/mm/mmio-mod.c:106: warning: too many arguments for format The 'fmt' was left out accidentally.
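For readers unfamiliar with the mechanism: the kernel's pr_* helpers expand to printk(pr_fmt(fmt), ...), so pr_fmt() must splice the caller's format string back in. A small illustration (the call site is made up):

#define pr_fmt(fmt) "mmiotrace: " fmt

/* pr_info("mapped at %p\n", addr) now expands, by string pasting, to:
 *
 *	printk(KERN_INFO "mmiotrace: " "mapped at %p\n", addr);
 *
 * With the trailing fmt missing, the caller's format string is dropped
 * while its arguments remain -- hence gcc's "too many arguments for
 * format" warning quoted above.
 */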
Reported-by: Stephen Rothwell Signed-off-by: Joe Perches Cc: Peter Zijlstra Cc: Linus LKML-Reference: <1260775443.18538.16.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 4c765e9c4664..34a3291ca103 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -20,7 +20,7 @@ * Derived from the read-mod example from relay-examples by Tom Zanussi. */ -#define pr_fmt(fmt) "mmiotrace: " +#define pr_fmt(fmt) "mmiotrace: " fmt #define DEBUG 1 -- cgit v1.2.2 From f4780ca005404166cc40af77ef0e86132ab98a81 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 14 Dec 2009 11:52:14 +0900 Subject: x86: Move swiotlb initialization before dma32_free_bootmem The commit 75f1cdf1dda92cae037ec848ae63690d91913eac introduced a bug: we initialize SWIOTLB right after dma32_free_bootmem, so we wrongly steal the memory area that was allocated earlier for GART on machines with a broken BIOS. This moves the swiotlb initialization before dma32_free_bootmem(). Signed-off-by: FUJITA Tomonori Cc: yinghai@kernel.org LKML-Reference: <1260759135-6450-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index afcc58b69c7c..fcc2f2bfa39c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -120,11 +120,14 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { + int use_swiotlb; + + use_swiotlb = pci_swiotlb_init(); #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif - if (pci_swiotlb_init()) + if (use_swiotlb) return; gart_iommu_hole_init(); -- cgit v1.2.2 From f3eee54276dfd1117fd94259f2b4a38388264724 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 14 Dec 2009 11:52:15 +0900 Subject: x86: Gart: fix breakage due to IOMMU initialization cleanup This fixes the following breakage of the commit 75f1cdf1dda92cae037ec848ae63690d91913eac: - GART systems that don't have AGP, with a broken BIOS and more than 4GB of memory, are forced to use swiotlb. They can allocate an aperture by hand and use GART. - GART systems without AGP must disable GART on shutdown. - When swiotlb usage is forced by the boot option, gart_iommu_hole_init() is not called, so we disable GART in early_gart_iommu_check(). Signed-off-by: Yinghai Lu Signed-off-by: FUJITA Tomonori LKML-Reference: <1260759135-6450-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 11 ++++++----- arch/x86/kernel/pci-gart_64.c | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index e0dfb6856aa2..3704997e8b25 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,8 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved.
*/ - int i, fix, slot; + u32 agp_aper_base = 0, agp_aper_order = 0; + int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; u64 aper_base = 0, last_aper_base = 0; @@ -290,6 +291,8 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ + agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; @@ -342,10 +345,10 @@ void __init early_gart_iommu_check(void) } } - if (!fix) + if (valid_agp) return; - /* different nodes have different setting, disable them all at first*/ + /* disable them all at first */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; @@ -458,8 +461,6 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (!valid_agp) { - /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || valid_agp || diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index e6a0d402f171..56c0e730d3fe 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -710,7 +710,8 @@ static void gart_iommu_shutdown(void) struct pci_dev *dev; int i; - if (no_agp) + /* don't shutdown it if there is AGP installed */ + if (!no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { -- cgit v1.2.2 From 485a2e1973fd9f98c2c6776e66ac4721882b69e0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 14 Dec 2009 17:56:34 +0900 Subject: x86, mce: Thermal monitoring depends on APIC being enabled Add a check that the APIC is not disabled, since thermal monitoring depends on it. If the APIC is disabled we should not try to install the "thermal monitor" vector, print out that thermal monitoring is enabled, etc. Note that "Intel Correct Machine Check Interrupts" already has such a check. Also I decided not to add a cpu_has_apic check into mcheck_intel_therm_init() since even if it calls apic_read() on a disabled APIC it's safe here, and this allows us to save a few code bytes. Reported-by: Thomas Gleixner Signed-off-by: Cyrill Gorcunov Signed-off-by: Hidetoshi Seto LKML-Reference: <4B25FDC2.3020401@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1003ed4bbce4..0a9b57702be4 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -274,8 +274,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c) int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + /* Thermal monitoring depends on APIC, ACPI and clock modulation */ + if (!cpu_has_apic || !cpu_has(c, X86_FEATURE_ACPI) || + !cpu_has(c, X86_FEATURE_ACC)) return; /* -- cgit v1.2.2 From 70fe440718d9f42bf963c2cffe12008eb5556165 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 14 Dec 2009 17:57:00 +0900 Subject: x86, mce: Clean up thermal init by introducing intel_thermal_supported() It looks better to have a common function. No change in functionality.
Signed-off-by: Hidetoshi Seto Cc: Cyrill Gorcunov LKML-Reference: <4B25FDDC.407@jp.fujitsu.com> Signed-off-by: Ingo Molnar Cc: Cyrill Gorcunov --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 0a9b57702be4..81c499eceb21 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -256,6 +256,16 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +/* Thermal monitoring depends on APIC, ACPI and clock modulation */ +static int intel_thermal_supported(struct cpuinfo_x86 *c) +{ + if (!cpu_has_apic) + return 0; + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return 0; + return 1; +} + void __init mcheck_intel_therm_init(void) { /* @@ -263,8 +273,7 @@ void __init mcheck_intel_therm_init(void) * LVT value on BSP and use that value to restore APs' thermal LVT * entry BIOS programmed later */ - if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && - cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + if (intel_thermal_supported(&boot_cpu_data)) lvtthmr_init = apic_read(APIC_LVTTHMR); } @@ -274,9 +283,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on APIC, ACPI and clock modulation */ - if (!cpu_has_apic || !cpu_has(c, X86_FEATURE_ACPI) || - !cpu_has(c, X86_FEATURE_ACC)) + if (!intel_thermal_supported(c)) return; /* -- cgit v1.2.2 From 494c2ebfb287eb10b229415063099e3700639028 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 14 Dec 2009 10:02:18 -0800 Subject: x86, msr: Remove incorrect, duplicated code in the MSR driver The MSR driver would compute the values for cpu and c at declaration, and then again in the body of the function. This isn't merely redundant, but unsafe, since cpu might not refer to a valid CPU at that point. Remove the unnecessary and dangerous references in the declarations. This code now matches the equivalent code in the CPUID driver. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/msr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 553449951b84..572b07eee3f4 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -172,11 +172,10 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) static int msr_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int cpu; + struct cpuinfo_x86 *c; cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ -- cgit v1.2.2 From 873b5271f878a11729fb4602c6ce967d0ff81119 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 14 Dec 2009 13:55:20 -0800 Subject: x86: Regex support and known-movable symbols for relocs, fix _end This adds a new category of symbols to the relocs program: symbols which are known to be relative, even though the linker emits them as absolute; this is the case for symbols that live in the linker script, which currently applies to _end. Unfortunately the previous workaround of putting _end in its own empty section was defeated by newer binutils, which remove empty sections completely. 
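To make the first point concrete: _end is assigned in the linker script rather than defined inside any input section -- the patch below reduces it to

	_end = .;

and ld emits such script-assigned symbols with section index SHN_ABS, so tools like nm report them as absolute ('A _end', rendering illustrative) even though the value tracks the end of the image. That is why the new rel_sym_regex below whitelists them by name rather than by ELF symbol type.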
This patch also changes the symbol matching to use regular expressions instead of hardcoded C for specific patterns. This is a decidedly non-minimal patch: a modified version of the relocs program is used as part of the Syslinux build, and this is basically a backport to Linux of some of those changes; they have thus been well tested. Signed-off-by: H. Peter Anvin LKML-Reference: <4AF86211.3070103@zytor.com> Acked-by: Michal Marek Tested-by: Sedat Dilek --- arch/x86/boot/compressed/relocs.c | 87 ++++++++++++++++++++++++++------------- arch/x86/kernel/vmlinux.lds.S | 4 +- 2 files changed, 60 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index bbeb0c3fbd90..89bbf4e4d05d 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -9,6 +9,9 @@ #include #define USE_BSD #include +#include <regex.h> + +static void die(char *fmt, ...); #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) static Elf32_Ehdr ehdr; @@ -30,25 +33,47 @@ static struct section *secs; * the address for which it has been compiled. Don't warn user about * absolute relocations present w.r.t these symbols. */ -static const char* safe_abs_relocs[] = { - "xen_irq_disable_direct_reloc", - "xen_save_fl_direct_reloc", -}; +static const char abs_sym_regex[] = + "^(xen_irq_disable_direct_reloc$|" + "xen_save_fl_direct_reloc$|" + "VDSO|" + "__crc_)"; +static regex_t abs_sym_regex_c; +static int is_abs_reloc(const char *sym_name) +{ + return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0); +} -static int is_safe_abs_reloc(const char* sym_name) +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ +static const char rel_sym_regex[] = + "^_end$"; +static regex_t rel_sym_regex_c; +static int is_rel_reloc(const char *sym_name) { - int i; + return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0); +} - for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { - if (!strcmp(sym_name, safe_abs_relocs[i])) - /* Match found */ - return 1; - } - if (strncmp(sym_name, "VDSO", 4) == 0) - return 1; - if (strncmp(sym_name, "__crc_", 6) == 0) - return 1; - return 0; +static void regex_init(void) +{ + char errbuf[128]; + int err; + + err = regcomp(&abs_sym_regex_c, abs_sym_regex, + REG_EXTENDED|REG_NOSUB); + if (err) { + regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf); + die("%s", errbuf); + } + + err = regcomp(&rel_sym_regex_c, rel_sym_regex, + REG_EXTENDED|REG_NOSUB); + if (err) { + regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf); + die("%s", errbuf); + } } static void die(char *fmt, ...) @@ -131,7 +156,7 @@ static const char *rel_type(unsigned type) #undef REL_TYPE }; const char *name = "unknown type rel type name"; - if (type < ARRAY_SIZE(type_name)) { + if (type < ARRAY_SIZE(type_name) && type_name[type]) { name = type_name[type]; } return name; @@ -448,7 +473,7 @@ static void print_absolute_relocs(void) * Before warning check if this absolute symbol * relocation is harmless.
*/ - if (is_safe_abs_reloc(name)) + if (is_abs_reloc(name) || is_rel_reloc(name)) continue; if (!printed) { @@ -501,21 +526,26 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; r_type = ELF32_R_TYPE(rel->r_info); /* Don't visit relocations to absolute symbols */ - if (sym->st_shndx == SHN_ABS) { + if (sym->st_shndx == SHN_ABS && + !is_rel_reloc(sym_name(sym_strtab, sym))) { continue; } - if (r_type == R_386_NONE || r_type == R_386_PC32) { + switch (r_type) { + case R_386_NONE: + case R_386_PC32: /* * NONE can be ignored and and PC relative * relocations don't need to be adjusted. */ - } - else if (r_type == R_386_32) { + break; + case R_386_32: /* Visit relocations that need to be adjusted */ visit(rel, sym); - } - else { - die("Unsupported relocation type: %d\n", r_type); + break; + default: + die("Unsupported relocation type: %s (%d)\n", + rel_type(r_type), r_type); + break; } } } @@ -571,16 +601,15 @@ static void emit_relocs(int as_text) } else { unsigned char buf[4]; - buf[0] = buf[1] = buf[2] = buf[3] = 0; /* Print a stop */ - printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); + fwrite("\0\0\0\0", 4, 1, stdout); /* Now print each relocation */ for (i = 0; i < reloc_count; i++) { buf[0] = (relocs[i] >> 0) & 0xff; buf[1] = (relocs[i] >> 8) & 0xff; buf[2] = (relocs[i] >> 16) & 0xff; buf[3] = (relocs[i] >> 24) & 0xff; - printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); + fwrite(buf, 4, 1, stdout); } } } @@ -598,6 +627,8 @@ int main(int argc, char **argv) FILE *fp; int i; + regex_init(); + show_absolute_syms = 0; show_absolute_relocs = 0; as_text = 0; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f3f2104408d9..f92a0da608cb 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -319,9 +319,7 @@ SECTIONS __brk_limit = .; } - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = .; - } + _end = .; STABS_DEBUG DWARF_DEBUG -- cgit v1.2.2 From 445c89514be242b1b0080056d50bdc1b72adeb5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Dec 2009 19:49:50 +0100 Subject: locking: Convert raw_spinlock to arch_spinlock The raw_spin* namespace was taken by lockdep for the architecture specific implementations. raw_spin_* would be the ideal name space for the spinlocks which are not converted to sleeping locks in preempt-rt. Linus suggested to convert the raw_ to arch_ locks and clean up the name space instead of using an artificial name like core_spin, atomic_spin or whatever. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S.
Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- arch/x86/include/asm/paravirt.h | 12 ++++++------ arch/x86/include/asm/paravirt_types.h | 14 +++++++------- arch/x86/include/asm/spinlock.h | 30 +++++++++++++++--------------- arch/x86/include/asm/spinlock_types.h | 4 ++-- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/paravirt-spinlocks.c | 2 +- arch/x86/kernel/tsc_sync.c | 2 +- arch/x86/xen/spinlock.c | 16 ++++++++-------- 8 files changed, 41 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index efb38994859c..5655f75f10b7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -731,34 +731,34 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int __raw_spin_is_locked(struct raw_spinlock *lock) +static inline int __raw_spin_is_locked(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); } -static inline int __raw_spin_is_contended(struct raw_spinlock *lock) +static inline int __raw_spin_is_contended(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); } #define __raw_spin_is_contended __raw_spin_is_contended -static __always_inline void __raw_spin_lock(struct raw_spinlock *lock) +static __always_inline void __raw_spin_lock(struct arch_spinlock *lock) { PVOP_VCALL1(pv_lock_ops.spin_lock, lock); } -static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock, +static __always_inline void __raw_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) { PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); } -static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock) +static __always_inline int __raw_spin_trylock(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); } -static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) +static __always_inline void __raw_spin_unlock(struct arch_spinlock *lock) { PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 9357473c8da0..b1e70d51e40c 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -318,14 +318,14 @@ struct pv_mmu_ops { phys_addr_t phys, pgprot_t flags); }; -struct raw_spinlock; +struct arch_spinlock; struct pv_lock_ops { - int (*spin_is_locked)(struct raw_spinlock *lock); - int (*spin_is_contended)(struct raw_spinlock *lock); - void (*spin_lock)(struct raw_spinlock *lock); - void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct raw_spinlock *lock); - void (*spin_unlock)(struct raw_spinlock *lock); + int (*spin_is_locked)(struct arch_spinlock *lock); + int (*spin_is_contended)(struct arch_spinlock *lock); + void (*spin_lock)(struct arch_spinlock *lock); + void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags); + int (*spin_trylock)(struct arch_spinlock *lock); + void (*spin_unlock)(struct arch_spinlock *lock); }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 4e77853321db..204b524fcf57 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -58,7 +58,7 @@ #if (NR_CPUS < 256) #define TICKET_SHIFT 8 -static __always_inline void 
__ticket_spin_lock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { short inc = 0x0100; @@ -77,7 +77,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) : "memory", "cc"); } -static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { int tmp, new; @@ -96,7 +96,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) return tmp; } -static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incb %0" : "+m" (lock->slock) @@ -106,7 +106,7 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) #else #define TICKET_SHIFT 16 -static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { int inc = 0x00010000; int tmp; @@ -127,7 +127,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) : "memory", "cc"); } -static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { int tmp; int new; @@ -149,7 +149,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) return tmp; } -static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incw %0" : "+m" (lock->slock) @@ -158,14 +158,14 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) } #endif -static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) +static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); } -static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) +static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); @@ -174,33 +174,33 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) #ifndef CONFIG_PARAVIRT_SPINLOCKS -static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +static inline int __raw_spin_is_locked(arch_spinlock_t *lock) { return __ticket_spin_is_locked(lock); } -static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +static inline int __raw_spin_is_contended(arch_spinlock_t *lock) { return __ticket_spin_is_contended(lock); } #define __raw_spin_is_contended __raw_spin_is_contended -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +static __always_inline void __raw_spin_lock(arch_spinlock_t *lock) { __ticket_spin_lock(lock); } -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __raw_spin_trylock(arch_spinlock_t *lock) { return __ticket_spin_trylock(lock); } -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __raw_spin_unlock(arch_spinlock_t *lock) { __ticket_spin_unlock(lock); } -static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, +static __always_inline void __raw_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { __raw_spin_lock(lock); @@ -208,7 +208,7 @@ static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, #endif /* CONFIG_PARAVIRT_SPINLOCKS */ -static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +static 
inline void __raw_spin_unlock_wait(arch_spinlock_t *lock) { while (__raw_spin_is_locked(lock)) cpu_relax(); diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 845f81c87091..2ae7637ed524 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -5,9 +5,9 @@ # error "please don't include this file directly" #endif -typedef struct raw_spinlock { +typedef struct arch_spinlock { unsigned int slock; -} raw_spinlock_t; +} arch_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b8ce165dde5d..0862d9d89c92 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -188,7 +188,7 @@ void dump_stack(void) } EXPORT_SYMBOL(dump_stack); -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 3a7c5a44082e..a0f39e090684 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,7 +8,7 @@ #include static inline void -default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { __raw_spin_lock(lock); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index eed156851f5d..9f908b9d1abe 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count; * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata arch_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 36a5141108df..24ded31b5aec 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -120,14 +120,14 @@ struct xen_spinlock { unsigned short spinners; /* count of waiting cpus */ }; -static int xen_spin_is_locked(struct raw_spinlock *lock) +static int xen_spin_is_locked(struct arch_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; return xl->lock != 0; } -static int xen_spin_is_contended(struct raw_spinlock *lock) +static int xen_spin_is_contended(struct arch_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; @@ -136,7 +136,7 @@ static int xen_spin_is_contended(struct raw_spinlock *lock) return xl->spinners != 0; } -static int xen_spin_trylock(struct raw_spinlock *lock) +static int xen_spin_trylock(struct arch_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; u8 old = 1; @@ -181,7 +181,7 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock __get_cpu_var(lock_spinners) = prev; } -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) +static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *prev; @@ -254,7 +254,7 @@ out: return ret; } -static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) +static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl 
= (struct xen_spinlock *)lock; unsigned timeout; @@ -291,12 +291,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) spin_time_accum_total(start_spin); } -static void xen_spin_lock(struct raw_spinlock *lock) +static void xen_spin_lock(struct arch_spinlock *lock) { __xen_spin_lock(lock, false); } -static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) { __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); } @@ -317,7 +317,7 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) } } -static void xen_spin_unlock(struct raw_spinlock *lock) +static void xen_spin_unlock(struct arch_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; -- cgit v1.2.2 From edc35bd72e2079b25f99c5da7d7a65dbbffc4a26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Dec 2009 12:38:57 +0100 Subject: locking: Rename __RAW_SPIN_LOCK_UNLOCKED to __ARCH_SPIN_LOCK_UNLOCKED Further name space cleanup. No functional change Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- arch/x86/include/asm/spinlock_types.h | 2 +- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/tsc_sync.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 2ae7637ed524..696f8364a4f3 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -9,7 +9,7 @@ typedef struct arch_spinlock { unsigned int slock; } arch_spinlock_t; -#define __RAW_SPIN_LOCK_UNLOCKED { 0 } +#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } typedef struct { unsigned int lock; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0862d9d89c92..5b75afac8a38 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -188,7 +188,7 @@ void dump_stack(void) } EXPORT_SYMBOL(dump_stack); -static arch_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9f908b9d1abe..f1714697a09a 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count; * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata arch_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; -- cgit v1.2.2 From 0199c4e68d1f02894bdefe4b5d9e9ee4aedd8d62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Dec 2009 20:01:25 +0100 Subject: locking: Convert __raw_spin* functions to arch_spin* Name space cleanup. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. 
Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- arch/x86/include/asm/paravirt.h | 14 +++++++------- arch/x86/include/asm/spinlock.h | 26 +++++++++++++------------- arch/x86/kernel/dumpstack.c | 6 +++--- arch/x86/kernel/paravirt-spinlocks.c | 2 +- arch/x86/kernel/tsc_sync.c | 8 ++++---- 5 files changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5655f75f10b7..dd59a85a918f 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -731,34 +731,34 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int __raw_spin_is_locked(struct arch_spinlock *lock) +static inline int arch_spin_is_locked(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); } -static inline int __raw_spin_is_contended(struct arch_spinlock *lock) +static inline int arch_spin_is_contended(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); } -#define __raw_spin_is_contended __raw_spin_is_contended +#define arch_spin_is_contended arch_spin_is_contended -static __always_inline void __raw_spin_lock(struct arch_spinlock *lock) +static __always_inline void arch_spin_lock(struct arch_spinlock *lock) { PVOP_VCALL1(pv_lock_ops.spin_lock, lock); } -static __always_inline void __raw_spin_lock_flags(struct arch_spinlock *lock, +static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) { PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); } -static __always_inline int __raw_spin_trylock(struct arch_spinlock *lock) +static __always_inline int arch_spin_trylock(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); } -static __always_inline void __raw_spin_unlock(struct arch_spinlock *lock) +static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) { PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); } diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 204b524fcf57..ab9055fd57d9 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -174,43 +174,43 @@ static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) #ifndef CONFIG_PARAVIRT_SPINLOCKS -static inline int __raw_spin_is_locked(arch_spinlock_t *lock) +static inline int arch_spin_is_locked(arch_spinlock_t *lock) { return __ticket_spin_is_locked(lock); } -static inline int __raw_spin_is_contended(arch_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { return __ticket_spin_is_contended(lock); } -#define __raw_spin_is_contended __raw_spin_is_contended +#define arch_spin_is_contended arch_spin_is_contended -static __always_inline void __raw_spin_lock(arch_spinlock_t *lock) +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) { __ticket_spin_lock(lock); } -static __always_inline int __raw_spin_trylock(arch_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { return __ticket_spin_trylock(lock); } -static __always_inline void __raw_spin_unlock(arch_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { __ticket_spin_unlock(lock); } -static __always_inline void __raw_spin_lock_flags(arch_spinlock_t *lock, +static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { - 
__raw_spin_lock(lock); + arch_spin_lock(lock); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ -static inline void __raw_spin_unlock_wait(arch_spinlock_t *lock) +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - while (__raw_spin_is_locked(lock)) + while (arch_spin_is_locked(lock)) cpu_relax(); } @@ -298,9 +298,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) #define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) #define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define arch_spin_relax(lock) cpu_relax() +#define arch_read_relax(lock) cpu_relax() +#define arch_write_relax(lock) cpu_relax() /* The {read|write|spin}_lock() on x86 are full memory barriers. */ static inline void smp_mb__after_lock(void) { } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 5b75afac8a38..0a0aa1cec8f1 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -207,11 +207,11 @@ unsigned __kprobes long oops_begin(void) /* racy, but better than risking deadlock. */ raw_local_irq_save(flags); cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { + if (!arch_spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; else - __raw_spin_lock(&die_lock); + arch_spin_lock(&die_lock); } die_nest_count++; die_owner = cpu; @@ -231,7 +231,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); + arch_spin_unlock(&die_lock); raw_local_irq_restore(flags); oops_exit(); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index a0f39e090684..676b8c77a976 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -10,7 +10,7 @@ static inline void default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { - __raw_spin_lock(lock); + arch_spin_lock(lock); } struct pv_lock_ops pv_lock_ops = { diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index f1714697a09a..0aa5fed8b9e6 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void) * previous TSC that was measured (possibly on * another CPU) and update the previous TSC timestamp. */ - __raw_spin_lock(&sync_lock); + arch_spin_lock(&sync_lock); prev = last_tsc; rdtsc_barrier(); now = get_cycles(); rdtsc_barrier(); last_tsc = now; - __raw_spin_unlock(&sync_lock); + arch_spin_unlock(&sync_lock); /* * Be nice every now and then (and also check whether @@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void) * we saw a time-warp of the TSC going backwards: */ if (unlikely(prev > now)) { - __raw_spin_lock(&sync_lock); + arch_spin_lock(&sync_lock); max_warp = max(max_warp, prev - now); nr_warps++; - __raw_spin_unlock(&sync_lock); + arch_spin_unlock(&sync_lock); } } WARN(!(now-start), -- cgit v1.2.2 From fb3a6bbc912b12347614e5742c7c61416cdb0ca0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Dec 2009 20:01:19 +0100 Subject: locking: Convert raw_rwlock to arch_rwlock Not strictly necessary for -rt as -rt does not have non sleeping rwlocks, but it's odd to not have a consistent naming convention. No functional change. 
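Context for the arch_rwlock_t diffs that follow: x86 implements an rwlock as one biased counter. Below is a minimal user-space C model of that scheme, as a sketch only — the kernel uses atomic_t and locked asm rather than C11 atomics, and the struct and helper names here are invented:

#include <stdatomic.h>
#include <stdbool.h>

#define RW_LOCK_BIAS 0x01000000        /* unlocked value: a reader costs 1, a writer takes the whole bias */

struct model_rwlock {
        atomic_int lock;               /* starts at RW_LOCK_BIAS when unlocked */
};

static bool model_read_trylock(struct model_rwlock *rw)
{
        if (atomic_fetch_sub(&rw->lock, 1) > 0) /* old value positive: no writer held it */
                return true;
        atomic_fetch_add(&rw->lock, 1);         /* undo: a writer owns the lock */
        return false;
}

static void model_read_unlock(struct model_rwlock *rw)
{
        atomic_fetch_add(&rw->lock, 1);
}

static bool model_write_trylock(struct model_rwlock *rw)
{
        int expected = RW_LOCK_BIAS;            /* succeeds only with no readers and no writer */
        return atomic_compare_exchange_strong(&rw->lock, &expected, 0);
}

static void model_write_unlock(struct model_rwlock *rw)
{
        atomic_fetch_add(&rw->lock, RW_LOCK_BIAS);
}

This mirrors the subl/addl asm in the hunks below: the read path subtracts 1 and checks the sign, the write path subtracts the full bias and checks for zero.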
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- arch/x86/include/asm/spinlock.h | 16 ++++++++-------- arch/x86/include/asm/spinlock_types.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index ab9055fd57d9..99cb86e843a0 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -232,7 +232,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_read_can_lock(raw_rwlock_t *lock) +static inline int __raw_read_can_lock(arch_rwlock_t *lock) { return (int)(lock)->lock > 0; } @@ -241,12 +241,12 @@ static inline int __raw_read_can_lock(raw_rwlock_t *lock) * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_write_can_lock(raw_rwlock_t *lock) +static inline int __raw_write_can_lock(arch_rwlock_t *lock) { return (lock)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void __raw_read_lock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" "jns 1f\n" @@ -255,7 +255,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw) ::LOCK_PTR_REG (rw) : "memory"); } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void __raw_write_lock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" "jz 1f\n" @@ -264,7 +264,7 @@ static inline void __raw_write_lock(raw_rwlock_t *rw) ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *lock) +static inline int __raw_read_trylock(arch_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -274,7 +274,7 @@ static inline int __raw_read_trylock(raw_rwlock_t *lock) return 0; } -static inline int __raw_write_trylock(raw_rwlock_t *lock) +static inline int __raw_write_trylock(arch_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -284,12 +284,12 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock) return 0; } -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void __raw_read_unlock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void __raw_write_unlock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX "addl %1, %0" : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 696f8364a4f3..dcb48b2edc11 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -13,8 +13,8 @@ typedef struct arch_spinlock { typedef struct { unsigned int lock; -} raw_rwlock_t; +} arch_rwlock_t; -#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } +#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } #endif /* _ASM_X86_SPINLOCK_TYPES_H */ -- cgit v1.2.2 From e5931943d02bf751b1ec849c0d2ade23d76a8d41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Dec 2009 20:08:46 +0100 Subject: locking: Convert raw_rwlock functions to arch_rwlock Name space cleanup for rwlock functions. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. 
Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- arch/x86/include/asm/spinlock.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 99cb86e843a0..3089f70c0c52 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -232,7 +232,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_read_can_lock(arch_rwlock_t *lock) +static inline int arch_read_can_lock(arch_rwlock_t *lock) { return (int)(lock)->lock > 0; } @@ -241,12 +241,12 @@ static inline int __raw_read_can_lock(arch_rwlock_t *lock) * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_write_can_lock(arch_rwlock_t *lock) +static inline int arch_write_can_lock(arch_rwlock_t *lock) { return (lock)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(arch_rwlock_t *rw) +static inline void arch_read_lock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" "jns 1f\n" @@ -255,7 +255,7 @@ static inline void __raw_read_lock(arch_rwlock_t *rw) ::LOCK_PTR_REG (rw) : "memory"); } -static inline void __raw_write_lock(arch_rwlock_t *rw) +static inline void arch_write_lock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" "jz 1f\n" @@ -264,7 +264,7 @@ static inline void __raw_write_lock(arch_rwlock_t *rw) ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline int __raw_read_trylock(arch_rwlock_t *lock) +static inline int arch_read_trylock(arch_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -274,7 +274,7 @@ static inline int __raw_read_trylock(arch_rwlock_t *lock) return 0; } -static inline int __raw_write_trylock(arch_rwlock_t *lock) +static inline int arch_write_trylock(arch_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -284,19 +284,19 @@ static inline int __raw_write_trylock(arch_rwlock_t *lock) return 0; } -static inline void __raw_read_unlock(arch_rwlock_t *rw) +static inline void arch_read_unlock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(arch_rwlock_t *rw) +static inline void arch_write_unlock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX "addl %1, %0" : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); } -#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) -#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) +#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) +#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) #define arch_spin_relax(lock) cpu_relax() #define arch_read_relax(lock) cpu_relax() -- cgit v1.2.2 From 239007b8440abff689632f50cdf0f2b9e895b534 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 16:46:45 +0100 Subject: genirq: Convert irq_desc.lock to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. 
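The rule this conversion enforces: on PREEMPT_RT a spinlock_t becomes a sleeping lock, so any lock taken from hard-interrupt context must be a raw_spinlock_t, which keeps true spinning semantics. A minimal sketch of the resulting pattern — the lock and handler names are illustrative, not taken from the patch:

#include <linux/spinlock.h>
#include <linux/interrupt.h>

static DEFINE_RAW_SPINLOCK(demo_lock);         /* stays a spinning lock even on -rt */

static irqreturn_t demo_handler(int irq, void *dev_id)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_lock, flags);
        /* ... touch state shared with process context ... */
        raw_spin_unlock_irqrestore(&demo_lock, flags);

        return IRQ_HANDLED;
}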
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/irq.c | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d5d498fbee4b..11a5851f1f50 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2431,7 +2431,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2450,7 +2450,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) } __get_cpu_var(vector_irq)[vector] = -1; unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } irq_exit(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 664bcb7384ac..91fd0c70a18a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc) return 0; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; @@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); out: - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -294,12 +294,12 @@ void fixup_irqs(void) continue; /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); affinity = desc->affinity; if (!irq_has_action(irq) || cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); continue; } @@ -326,7 +326,7 @@ void fixup_irqs(void) if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) desc->chip->unmask(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); @@ -356,10 +356,10 @@ void fixup_irqs(void) irq = __get_cpu_var(vector_irq)[vector]; desc = irq_to_desc(irq); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (desc->chip->retrigger) desc->chip->retrigger(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } } } -- cgit v1.2.2 From 03a05ed1152944000151d57b71000de287a1eb02 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Fri, 11 Dec 2009 15:17:20 +0800 Subject: ACPI: Use the ARB_DISABLE for the CPU which model id is less than 0x0f. Currently, ARB_DISABLE is a NOP on all of the recent Intel platforms. For such platforms, reduce contention on c3_lock by skipping the fake ARB_DISABLE. The cpu model id on one laptop is 14 (0x0e). If we disable ARB_DISABLE on this box, it can't boot correctly; if we keep ARB_DISABLE enabled, it boots correctly. So we still use ARB_DISABLE for CPUs whose model id is less than 0x0f.
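A hedged restatement of the gate adjusted in the diff below, for clarity — decimal 14 is 0x0e, so raising the cutoff to 0x0f restores a real ARB_DISABLE on model-14 parts; the helper name is invented:

#include <linux/types.h>
#include <asm/processor.h>

static bool arb_disable_is_fake(const struct cpuinfo_x86 *c)
{
        /* Family > 0xf, or family 6 model >= 0x0f: ARB_DISABLE is a NOP.
         * Family 6 model 0x0e (14) still needs the real ARB_DISABLE. */
        return c->x86_vendor == X86_VENDOR_INTEL &&
               (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f));
}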
http://bugzilla.kernel.org/show_bug.cgi?id=14700 Signed-off-by: Zhao Yakui Acked-by: Pallipadi, Venkatesh cc: stable@kernel.org Signed-off-by: Len Brown --- arch/x86/kernel/acpi/cstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 59cdfa4686b2..2e837f5080fe 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); -- cgit v1.2.2 From e6428047725d72d63c1d9c4ba852e635e3ffe52a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 15 Dec 2009 16:28:13 -0600 Subject: x86: don't export inline function For CONFIG_PARAVIRT, load_gs_index is an inline function (it's #defined to native_load_gs_index otherwise). Exporting an inline function breaks the new assembler-based alphabetically sorted symbol list: Today's linux-next build (x86_64 allmodconfig) failed like this: .tmp_exports-asm.o: In function `__ksymtab_load_gs_index': (__ksymtab_sorted+0x5b40): undefined reference to `load_gs_index' Signed-off-by: Rusty Russell To: x86@kernel.org Cc: alan-jenkins@tuffmail.co.uk --- arch/x86/kernel/x8664_ksyms_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a1029769b6f2..084c1adc45f5 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -56,4 +56,6 @@ EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(init_level4_pgt); -EXPORT_SYMBOL(load_gs_index); +#ifndef CONFIG_PARAVIRT +EXPORT_SYMBOL(native_load_gs_index); +#endif -- cgit v1.2.2 From 186a25026c44d1bfa97671110ff14dcd0c99678e Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Dec 2009 20:47:56 +0900 Subject: x86: Split swiotlb initialization into two stages The commit f4780ca005404166cc40af77ef0e86132ab98a81 moves swiotlb initialization before dma32_free_bootmem(). It's supposed to fix a bug that commit 75f1cdf1dda92cae037ec848ae63690d91913eac introduced: we initialized SWIOTLB right after dma32_free_bootmem(), so with a broken BIOS we wrongly stole the memory area allocated for GART. However, the above commit introduced another problem, which likely breaks machines with a huge amount of memory. Such a box uses the majority of DMA32_ZONE, so there is no memory left for swiotlb. With this patch, the x86 IOMMU initialization sequence is: 1. We set swiotlb to 1 in the case of (max_pfn > MAX_DMA32_PFN && !no_iommu). If swiotlb usage is forced by the boot option, we go to step 3 and finish (we don't try to detect IOMMUs). 2. We call the detection functions of all the IOMMUs. The detection function sets x86_init.iommu.iommu_init to the IOMMU initialization function (so we can avoid calling the initialization functions of all the IOMMUs needlessly). 3. We initialize swiotlb (and set dma_ops to swiotlb_dma_ops) if swiotlb is set to 1. 4. If the IOMMU initialization function doesn't need swiotlb (e.g. the initialization was successful), it sets swiotlb to zero. 5. If we find that swiotlb is set to zero, we free the swiotlb resources (the resulting call order is sketched below).
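A condensed sketch of the call order those five steps produce, following the pci-dma.c hunk below (the other IOMMU detect hooks that sit between these two calls in the real file are elided):

void __init pci_iommu_alloc(void)
{
#ifdef CONFIG_X86_64
        dma32_free_bootmem();          /* free the low range before anything allocates from it */
#endif
        if (pci_swiotlb_detect())      /* stage 1: decide only; non-zero when swiotlb is forced */
                goto out;              /* forced: skip IOMMU detection entirely */

        gart_iommu_hole_init();        /* each detect hook may clear 'swiotlb' ... */
        amd_iommu_detect();            /* ... once it knows it can handle this machine */
out:
        pci_swiotlb_init();            /* stage 2: allocate only if 'swiotlb' is still set */
}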
Reported-by: Yinghai Lu Reported-by: Roland Dreier Signed-off-by: FUJITA Tomonori LKML-Reference: <20091215204729A.fujita.tomonori@lab.ntt.co.jp> Tested-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/swiotlb.h | 8 ++++++-- arch/x86/kernel/pci-dma.c | 9 ++++----- arch/x86/kernel/pci-swiotlb.c | 11 +++++++---- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 87ffcb12a1b8..8085277e1b8b 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -5,13 +5,17 @@ #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern int pci_swiotlb_init(void); +extern int __init pci_swiotlb_detect(void); +extern void __init pci_swiotlb_init(void); #else #define swiotlb 0 -static inline int pci_swiotlb_init(void) +static inline int pci_swiotlb_detect(void) { return 0; } +static inline void pci_swiotlb_init(void) +{ +} #endif static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fcc2f2bfa39c..75e14e21f61a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -120,15 +120,12 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { - int use_swiotlb; - - use_swiotlb = pci_swiotlb_init(); #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif - if (use_swiotlb) - return; + if (pci_swiotlb_detect()) + goto out; gart_iommu_hole_init(); @@ -138,6 +135,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); +out: + pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e3c0a66b9e77..7d2829dde20e 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -43,12 +43,12 @@ static struct dma_map_ops swiotlb_dma_ops = { }; /* - * pci_swiotlb_init - initialize swiotlb if necessary + * pci_swiotlb_detect - set swiotlb to 1 if necessary * * This returns non-zero if we are forced to use swiotlb (by the boot * option). */ -int __init pci_swiotlb_init(void) +int __init pci_swiotlb_detect(void) { int use_swiotlb = swiotlb | swiotlb_force; @@ -60,10 +60,13 @@ int __init pci_swiotlb_init(void) if (swiotlb_force) swiotlb = 1; + return use_swiotlb; +} + +void __init pci_swiotlb_init(void) +{ if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } - - return use_swiotlb; } -- cgit v1.2.2 From 4e25b2576efda24c02e2d6b9bcb5965a3f865f33 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 14 Dec 2009 17:58:23 -0800 Subject: hugetlb: add generic definition of NUMA_NO_NODE Move definition of NUMA_NO_NODE from ia64 and x86_64 arch specific headers to generic header 'linux/numa.h' for use in generic code. NUMA_NO_NODE replaces bare '-1' where it's used in this series to indicate "no node id specified". Ultimately, it can be used to replace the -1 elsewhere where it is used similarly. 
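A small usage sketch of the now-generic constant; the helper is invented, but it shows the intent — the bare -1 becomes self-documenting:

#include <linux/numa.h>
#include <linux/topology.h>
#include <linux/slab.h>

static void *demo_alloc(size_t size, int nid)
{
        if (nid == NUMA_NO_NODE)        /* caller did not specify a node */
                nid = numa_node_id();   /* fall back to the local node */

        return kmalloc_node(size, GFP_KERNEL, nid);
}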
Signed-off-by: Lee Schermerhorn Acked-by: David Rientjes Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: KAMEZAWA Hiroyuki Cc: Randy Dunlap Cc: Nishanth Aravamudan Cc: Adam Litke Cc: Andy Whitcroft Cc: Eric Whitney Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/topology.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 40e37b10c6c0..c5087d796587 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -35,11 +35,16 @@ # endif #endif -/* Node not present */ -#define NUMA_NO_NODE (-1) +/* + * to preserve the visibility of NUMA_NO_NODE definition, + * moved to there from here. May be used independent of + * CONFIG_NUMA. + */ +#include #ifdef CONFIG_NUMA #include + #include #ifdef CONFIG_X86_32 -- cgit v1.2.2 From 5f0a96b044d8edaee20f4a32ef6c393599ca55f8 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:32 -0800 Subject: cs5535-gpio: add AMD CS5535/CS5536 GPIO driver support This creates a CS5535/CS5536 GPIO driver which uses a gpio_chip backend (allowing GPIO users to use the generic GPIO API if desired) while also allowing architecture-specific users directly (via the cs5535_gpio_* functions). Tested on an OLPC machine. Some Leemotes also use CS5536 (with a mips cpu), which is why this is in drivers/gpio rather than arch/x86. Currently, it conflicts with older geode GPIO support; once MFGPT support is reworked to also be more generic, the older geode code will be removed. Signed-off-by: Andres Salomon Cc: Takashi Iwai Cc: Jordan Crouse Cc: David Brownell Reviewed-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/geode.h | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h index ad3c2ed75481..5716214d37d9 100644 --- a/arch/x86/include/asm/geode.h +++ b/arch/x86/include/asm/geode.h @@ -12,6 +12,7 @@ #include #include +#include /* Generic southbridge functions */ @@ -115,33 +116,6 @@ extern int geode_get_dev_base(unsigned int dev); #define VSA_VR_MEM_SIZE 0x0200 #define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */ #define GSW_VSA_SIG 0x534d /* General Software signature */ -/* GPIO */ - -#define GPIO_OUTPUT_VAL 0x00 -#define GPIO_OUTPUT_ENABLE 0x04 -#define GPIO_OUTPUT_OPEN_DRAIN 0x08 -#define GPIO_OUTPUT_INVERT 0x0C -#define GPIO_OUTPUT_AUX1 0x10 -#define GPIO_OUTPUT_AUX2 0x14 -#define GPIO_PULL_UP 0x18 -#define GPIO_PULL_DOWN 0x1C -#define GPIO_INPUT_ENABLE 0x20 -#define GPIO_INPUT_INVERT 0x24 -#define GPIO_INPUT_FILTER 0x28 -#define GPIO_INPUT_EVENT_COUNT 0x2C -#define GPIO_READ_BACK 0x30 -#define GPIO_INPUT_AUX1 0x34 -#define GPIO_EVENTS_ENABLE 0x38 -#define GPIO_LOCK_ENABLE 0x3C -#define GPIO_POSITIVE_EDGE_EN 0x40 -#define GPIO_NEGATIVE_EDGE_EN 0x44 -#define GPIO_POSITIVE_EDGE_STS 0x48 -#define GPIO_NEGATIVE_EDGE_STS 0x4C - -#define GPIO_MAP_X 0xE0 -#define GPIO_MAP_Y 0xE4 -#define GPIO_MAP_Z 0xE8 -#define GPIO_MAP_W 0xEC static inline u32 geode_gpio(unsigned int nr) { -- cgit v1.2.2 From 3c55494670745e523f69b56edb66ca0b50a470c2 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:36 -0800 Subject: ALSA: cs5535audio: free OLPC quirks from reliance on MGEODE_LX cpu optimization Previously, OLPC support for the mic extensions was only enabled in the ALSA 
driver if CONFIG_OLPC and CONFIG_MGEODE_LX were both set. This was because the old geode GPIO code was written in a manner that assumed CONFIG_MGEODE_LX. With the new cs553x-gpio driver, this is no longer the case; as such, we can drop the requirement on CONFIG_MGEODE_LX and instead include a requirement on GPIOLIB. We use the generic GPIO API rather than the cs553x-specific API. Signed-off-by: Andres Salomon Cc: Takashi Iwai Cc: Jordan Crouse Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/olpc.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 32a1918e1b88..6ad0985e8d76 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2024,6 +2024,7 @@ config GEODE_MFGPT_TIMER config OLPC bool "One Laptop Per Child support" + select GPIOLIB default n ---help--- Add support for detecting the unique features of the OLPC diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 834a30295fab..3a57385d9fa7 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -120,7 +120,7 @@ extern int olpc_ec_mask_unset(uint8_t bits); /* GPIO assignments */ -#define OLPC_GPIO_MIC_AC geode_gpio(1) +#define OLPC_GPIO_MIC_AC 1 #define OLPC_GPIO_DCON_IRQ geode_gpio(7) #define OLPC_GPIO_THRM_ALRM geode_gpio(10) #define OLPC_GPIO_SMB_CLK geode_gpio(14) -- cgit v1.2.2 From 82dca611bb516ec5fb7d04077733d6a4b70f52d1 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:37 -0800 Subject: cs5535: add a generic MFGPT driver This is based on the old code in arch/x86/kernel/mfgpt_32.c, except it's not x86-specific, it's modular, and it makes use of a PCI BAR rather than a random MSR. Currently module unloading is not supported; it's uncertain whether or not it can be made to work with the hardware.
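A sketch of consuming the new driver, assuming the cs5535_mfgpt_* entry points and MFGPT_* constants this series puts in linux/cs5535.h — the exact signatures may differ from what is shown here:

#include <linux/cs5535.h>
#include <linux/errno.h>
#include <linux/init.h>

static struct cs5535_mfgpt_timer *demo_timer;

static int __init demo_mfgpt_init(void)
{
        /* grab any free timer in the working power domain */
        demo_timer = cs5535_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
        if (!demo_timer)
                return -ENODEV;

        cs5535_mfgpt_write(demo_timer, MFGPT_REG_COUNTER, 0);   /* reset the count */
        cs5535_mfgpt_write(demo_timer, MFGPT_REG_SETUP, MFGPT_SETUP_CNTEN);
        return 0;
}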
[akpm@linux-foundation.org: add X86 dependency] Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Chris Ball Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/geode.h | 40 ---------------------------------------- 1 file changed, 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h index 5716214d37d9..547e9642642a 100644 --- a/arch/x86/include/asm/geode.h +++ b/arch/x86/include/asm/geode.h @@ -47,16 +47,6 @@ extern int geode_get_dev_base(unsigned int dev); #define MSR_DIVIL_SOFT_RESET 0x51400017 -#define MSR_PIC_YSEL_LOW 0x51400020 -#define MSR_PIC_YSEL_HIGH 0x51400021 -#define MSR_PIC_ZSEL_LOW 0x51400022 -#define MSR_PIC_ZSEL_HIGH 0x51400023 -#define MSR_PIC_IRQM_LPC 0x51400025 - -#define MSR_MFGPT_IRQ 0x51400028 -#define MSR_MFGPT_NR 0x51400029 -#define MSR_MFGPT_SETUP 0x5140002B - #define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */ #define MSR_GX_GLD_MSR_CONFIG 0xC0002001 @@ -169,36 +159,6 @@ static inline int geode_has_vsa2(void) } #endif -/* MFGPTs */ - -#define MFGPT_MAX_TIMERS 8 -#define MFGPT_TIMER_ANY (-1) - -#define MFGPT_DOMAIN_WORKING 1 -#define MFGPT_DOMAIN_STANDBY 2 -#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY) - -#define MFGPT_CMP1 0 -#define MFGPT_CMP2 1 - -#define MFGPT_EVENT_IRQ 0 -#define MFGPT_EVENT_NMI 1 -#define MFGPT_EVENT_RESET 3 - -#define MFGPT_REG_CMP1 0 -#define MFGPT_REG_CMP2 2 -#define MFGPT_REG_COUNTER 4 -#define MFGPT_REG_SETUP 6 - -#define MFGPT_SETUP_CNTEN (1 << 15) -#define MFGPT_SETUP_CMP2 (1 << 14) -#define MFGPT_SETUP_CMP1 (1 << 13) -#define MFGPT_SETUP_SETUP (1 << 12) -#define MFGPT_SETUP_STOPEN (1 << 11) -#define MFGPT_SETUP_EXTEN (1 << 10) -#define MFGPT_SETUP_REVEN (1 << 5) -#define MFGPT_SETUP_CLKSEL (1 << 4) - static inline void geode_mfgpt_write(int timer, u16 reg, u16 value) { u32 base = geode_get_dev_base(GEODE_DEV_MFGPT); -- cgit v1.2.2 From 2e8c12436f540d3c40137ebf10268803dc972f6a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:39 -0800 Subject: cs5535: move the DIVIL MSR definition into linux/cs5535.h The only thing that uses this is the reboot_fixups code. 
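That single user is a warm-reset fixup along these lines — a sketch; the in-tree fixup may differ in detail:

#include <linux/cs5535.h>
#include <linux/delay.h>
#include <asm/msr.h>

static void cs5536_warm_reset_sketch(void)
{
        /* setting bit 0 of the DIVIL soft-reset MSR triggers the reset */
        wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL);
        udelay(50);     /* give the southbridge time to assert reset */
}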
Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Chris Ball Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/geode.h | 2 -- arch/x86/kernel/reboot_fixups_32.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h index 547e9642642a..976b3f11c009 100644 --- a/arch/x86/include/asm/geode.h +++ b/arch/x86/include/asm/geode.h @@ -45,8 +45,6 @@ extern int geode_get_dev_base(unsigned int dev); #define MSR_LBAR_ACPI 0x5140000E #define MSR_LBAR_PMS 0x5140000F -#define MSR_DIVIL_SOFT_RESET 0x51400017 - #define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */ #define MSR_GX_GLD_MSR_CONFIG 0xC0002001 diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index 201eab63b05f..fda313ebbb03 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include static void cs5530a_warm_reset(struct pci_dev *dev) { -- cgit v1.2.2 From f060f27007b393bac6e50ee6fc26d8505acf6fe4 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:40 -0800 Subject: cs5535: move VSA2 checks into linux/cs5535.h Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Chris Ball Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/geode.h | 19 ------------------- arch/x86/kernel/geode_32.c | 22 ---------------------- arch/x86/kernel/olpc.c | 4 ++-- 3 files changed, 2 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h index 976b3f11c009..df1eaf87426a 100644 --- a/arch/x86/include/asm/geode.h +++ b/arch/x86/include/asm/geode.h @@ -95,16 +95,6 @@ extern int geode_get_dev_base(unsigned int dev); #define PM_AWKD 0x50 #define PM_SSC 0x54 -/* VSA2 magic values */ - -#define VSA_VRC_INDEX 0xAC1C -#define VSA_VRC_DATA 0xAC1E -#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */ -#define VSA_VR_SIGNATURE 0x0003 -#define VSA_VR_MEM_SIZE 0x0200 -#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */ -#define GSW_VSA_SIG 0x534d /* General Software signature */ - static inline u32 geode_gpio(unsigned int nr) { BUG_ON(nr > 28); @@ -148,15 +138,6 @@ static inline int is_geode(void) return (is_geode_gx() || is_geode_lx()); } -#ifdef CONFIG_MGEODE_LX -extern int geode_has_vsa2(void); -#else -static inline int geode_has_vsa2(void) -{ - return 0; -} -#endif - static inline void geode_mfgpt_write(int timer, u16 reg, u16 value) { u32 base = geode_get_dev_base(GEODE_DEV_MFGPT); diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index 9b08e852fd1a..9dad6ca6cd70 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -161,28 +161,6 @@ void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) } EXPORT_SYMBOL_GPL(geode_gpio_setup_event); -int geode_has_vsa2(void) -{ - static int has_vsa2 = -1; - - if (has_vsa2 == -1) { - u16 val; - - /* - * The VSA has virtual registers that we can query for a - * signature. 
- */ - outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); - outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); - - val = inw(VSA_VRC_DATA); - has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG); - } - - return has_vsa2; -} -EXPORT_SYMBOL_GPL(geode_has_vsa2); - static int __init geode_southbridge_init(void) { if (!is_geode()) diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 4006c522adc7..9d1d263f786f 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -212,7 +212,7 @@ static int __init olpc_init(void) unsigned char *romsig; /* The ioremap check is dangerous; limit what we run it on */ - if (!is_geode() || geode_has_vsa2()) + if (!is_geode() || cs5535_has_vsa2()) return 0; spin_lock_init(&ec_lock); @@ -244,7 +244,7 @@ static int __init olpc_init(void) (unsigned char *) &olpc_platform_info.ecver, 1); /* check to see if the VSA exists */ - if (geode_has_vsa2()) + if (cs5535_has_vsa2()) olpc_platform_info.flags |= OLPC_F_VSA; printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", -- cgit v1.2.2 From f3a57a60d3e107d17aebb9e52b61c503e5bc14f9 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:40 -0800 Subject: cs5535: define lxfb/gxfb MSRs in linux/cs5535.h ..and include them in the lxfb/gxfb drivers rather than asm/geode.h (where possible). Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Chris Ball Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/geode.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h index df1eaf87426a..ae104da6ad5a 100644 --- a/arch/x86/include/asm/geode.h +++ b/arch/x86/include/asm/geode.h @@ -31,25 +31,12 @@ extern int geode_get_dev_base(unsigned int dev); /* MSRS */ -#define MSR_GLIU_P2D_RO0 0x10000029 - -#define MSR_LX_GLD_MSR_CONFIG 0x48002001 -#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data - * sheet has the wrong value */ -#define MSR_GLCP_SYS_RSTPLL 0x4C000014 -#define MSR_GLCP_DOTPLL 0x4C000015 - #define MSR_LBAR_SMB 0x5140000B #define MSR_LBAR_GPIO 0x5140000C #define MSR_LBAR_MFGPT 0x5140000D #define MSR_LBAR_ACPI 0x5140000E #define MSR_LBAR_PMS 0x5140000F -#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */ - -#define MSR_GX_GLD_MSR_CONFIG 0xC0002001 -#define MSR_GX_MSR_PADSEL 0xC0002011 - /* Resource Sizes */ #define LBAR_GPIO_SIZE 0xFF -- cgit v1.2.2 From c95d1e53ed89b75a4d7b68d1cbae4607b1479243 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:41 -0800 Subject: cs5535: drop the Geode-specific MFGPT/GPIO code With generic modular drivers handling all of this stuff, the geode-specific code can go away. The cs5535-gpio, cs5535-mfgpt, and cs5535-clockevt drivers now handle this. 
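With the geode_gpio_* helpers gone, callers use plain gpiolib. A minimal sketch — the pin matches the OLPC_GPIO_MIC_AC change earlier in this series, while the label and helper name are invented:

#include <linux/gpio.h>
#include <asm/olpc.h>

static int demo_claim_mic_pin(void)
{
        int err;

        err = gpio_request(OLPC_GPIO_MIC_AC, "olpc-mic-ac");    /* CS5535 GPIO 1 */
        if (err)
                return err;

        return gpio_direction_output(OLPC_GPIO_MIC_AC, 0);      /* drive it low initially */
}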
Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Chris Ball Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 10 -- arch/x86/include/asm/geode.h | 117 ------------ arch/x86/kernel/Makefile | 1 - arch/x86/kernel/geode_32.c | 174 ------------------ arch/x86/kernel/mfgpt_32.c | 410 ------------------------------------------- 5 files changed, 712 deletions(-) delete mode 100644 arch/x86/kernel/geode_32.c delete mode 100644 arch/x86/kernel/mfgpt_32.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6ad0985e8d76..3b2a5aca4edb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2012,16 +2012,6 @@ config SCx200HR_TIMER processor goes idle (as is done by the scheduler). The other workaround is idle=poll boot option. -config GEODE_MFGPT_TIMER - def_bool y - prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" - depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS - ---help--- - This driver provides a clock event source based on the MFGPT - timer(s) in the CS5535 and CS5536 companion chip for the geode. - MFGPTs have a better resolution and max interval than the - generic PIT, and are suitable for use as high-res timers. - config OLPC bool "One Laptop Per Child support" select GPIOLIB diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h index ae104da6ad5a..7cd73552a4e8 100644 --- a/arch/x86/include/asm/geode.h +++ b/arch/x86/include/asm/geode.h @@ -14,98 +14,6 @@ #include #include -/* Generic southbridge functions */ - -#define GEODE_DEV_PMS 0 -#define GEODE_DEV_ACPI 1 -#define GEODE_DEV_GPIO 2 -#define GEODE_DEV_MFGPT 3 - -extern int geode_get_dev_base(unsigned int dev); - -/* Useful macros */ -#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS) -#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI) -#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO) -#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT) - -/* MSRS */ - -#define MSR_LBAR_SMB 0x5140000B -#define MSR_LBAR_GPIO 0x5140000C -#define MSR_LBAR_MFGPT 0x5140000D -#define MSR_LBAR_ACPI 0x5140000E -#define MSR_LBAR_PMS 0x5140000F - -/* Resource Sizes */ - -#define LBAR_GPIO_SIZE 0xFF -#define LBAR_MFGPT_SIZE 0x40 -#define LBAR_ACPI_SIZE 0x40 -#define LBAR_PMS_SIZE 0x80 - -/* ACPI registers (PMS block) */ - -/* - * PM1_EN is only valid when VSA is enabled for 16 bit reads. 
- * When VSA is not enabled, *always* read both PM1_STS and PM1_EN - * with a 32 bit read at offset 0x0 - */ - -#define PM1_STS 0x00 -#define PM1_EN 0x02 -#define PM1_CNT 0x08 -#define PM2_CNT 0x0C -#define PM_TMR 0x10 -#define PM_GPE0_STS 0x18 -#define PM_GPE0_EN 0x1C - -/* PMC registers (PMS block) */ - -#define PM_SSD 0x00 -#define PM_SCXA 0x04 -#define PM_SCYA 0x08 -#define PM_OUT_SLPCTL 0x0C -#define PM_SCLK 0x10 -#define PM_SED 0x1 -#define PM_SCXD 0x18 -#define PM_SCYD 0x1C -#define PM_IN_SLPCTL 0x20 -#define PM_WKD 0x30 -#define PM_WKXD 0x34 -#define PM_RD 0x38 -#define PM_WKXA 0x3C -#define PM_FSD 0x40 -#define PM_TSD 0x44 -#define PM_PSD 0x48 -#define PM_NWKD 0x4C -#define PM_AWKD 0x50 -#define PM_SSC 0x54 - -static inline u32 geode_gpio(unsigned int nr) -{ - BUG_ON(nr > 28); - return 1 << nr; -} - -extern void geode_gpio_set(u32, unsigned int); -extern void geode_gpio_clear(u32, unsigned int); -extern int geode_gpio_isset(u32, unsigned int); -extern void geode_gpio_setup_event(unsigned int, int, int); -extern void geode_gpio_set_irq(unsigned int, unsigned int); - -static inline void geode_gpio_event_irq(unsigned int gpio, int pair) -{ - geode_gpio_setup_event(gpio, pair, 0); -} - -static inline void geode_gpio_event_pme(unsigned int gpio, int pair) -{ - geode_gpio_setup_event(gpio, pair, 1); -} - -/* Specific geode tests */ - static inline int is_geode_gx(void) { return ((boot_cpu_data.x86_vendor == X86_VENDOR_NSC) && @@ -125,29 +33,4 @@ static inline int is_geode(void) return (is_geode_gx() || is_geode_lx()); } -static inline void geode_mfgpt_write(int timer, u16 reg, u16 value) -{ - u32 base = geode_get_dev_base(GEODE_DEV_MFGPT); - outw(value, base + reg + (timer * 8)); -} - -static inline u16 geode_mfgpt_read(int timer, u16 reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_MFGPT); - return inw(base + reg + (timer * 8)); -} - -extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable); -extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable); -extern int geode_mfgpt_alloc_timer(int timer, int domain); - -#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1) -#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0) - -#ifdef CONFIG_GEODE_MFGPT_TIMER -extern int __init mfgpt_timer_setup(void); -#else -static inline int mfgpt_timer_setup(void) { return 0; } -#endif - #endif /* _ASM_X86_GEODE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4f2e66e29ecc..d87f09bc5a52 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -89,7 +89,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c deleted file mode 100644 index 9dad6ca6cd70..000000000000 --- a/arch/x86/kernel/geode_32.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * AMD Geode southbridge support code - * Copyright (C) 2006, Advanced Micro Devices, Inc. - * Copyright (C) 2007, Andres Salomon - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include - -static struct { - char *name; - u32 msr; - int size; - u32 base; -} lbars[] = { - { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, - { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, - { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, - { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } -}; - -static void __init init_lbars(void) -{ - u32 lo, hi; - int i; - - for (i = 0; i < ARRAY_SIZE(lbars); i++) { - rdmsr(lbars[i].msr, lo, hi); - if (hi & 0x01) - lbars[i].base = lo & 0x0000ffff; - - if (lbars[i].base == 0) - printk(KERN_ERR "geode: Couldn't initialize '%s'\n", - lbars[i].name); - } -} - -int geode_get_dev_base(unsigned int dev) -{ - BUG_ON(dev >= ARRAY_SIZE(lbars)); - return lbars[dev].base; -} -EXPORT_SYMBOL_GPL(geode_get_dev_base); - -/* === GPIO API === */ - -void geode_gpio_set(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - - if (!base) - return; - - /* low bank register */ - if (gpio & 0xFFFF) - outl(gpio & 0xFFFF, base + reg); - /* high bank register */ - gpio >>= 16; - if (gpio) - outl(gpio, base + 0x80 + reg); -} -EXPORT_SYMBOL_GPL(geode_gpio_set); - -void geode_gpio_clear(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - - if (!base) - return; - - /* low bank register */ - if (gpio & 0xFFFF) - outl((gpio & 0xFFFF) << 16, base + reg); - /* high bank register */ - gpio &= (0xFFFF << 16); - if (gpio) - outl(gpio, base + 0x80 + reg); -} -EXPORT_SYMBOL_GPL(geode_gpio_clear); - -int geode_gpio_isset(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - u32 val; - - if (!base) - return 0; - - /* low bank register */ - if (gpio & 0xFFFF) { - val = inl(base + reg) & (gpio & 0xFFFF); - if ((gpio & 0xFFFF) == val) - return 1; - } - /* high bank register */ - gpio >>= 16; - if (gpio) { - val = inl(base + 0x80 + reg) & gpio; - if (gpio == val) - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(geode_gpio_isset); - -void geode_gpio_set_irq(unsigned int group, unsigned int irq) -{ - u32 lo, hi; - - if (group > 7 || irq > 15) - return; - - rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); - - lo &= ~(0xF << (group * 4)); - lo |= (irq & 0xF) << (group * 4); - - wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); -} -EXPORT_SYMBOL_GPL(geode_gpio_set_irq); - -void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - u32 offset, shift, val; - - if (gpio >= 24) - offset = GPIO_MAP_W; - else if (gpio >= 16) - offset = GPIO_MAP_Z; - else if (gpio >= 8) - offset = GPIO_MAP_Y; - else - offset = GPIO_MAP_X; - - shift = (gpio % 8) * 4; - - val = inl(base + offset); - - /* Clear whatever was there before */ - val &= ~(0xF << shift); - - /* And set the new value */ - - val |= ((pair & 7) << shift); - - /* Set the PME bit if this is a PME event */ - - if (pme) - val |= (1 << (shift + 3)); - - outl(val, base + offset); -} -EXPORT_SYMBOL_GPL(geode_gpio_setup_event); - -static int __init geode_southbridge_init(void) -{ - if (!is_geode()) - return -ENODEV; - - init_lbars(); - (void) mfgpt_timer_setup(); - return 0; -} - -postcore_initcall(geode_southbridge_init); diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c deleted file mode 100644 index 2a62d843f015..000000000000 --- a/arch/x86/kernel/mfgpt_32.c +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT) - * - * Copyright (C) 2006, Advanced Micro Devices, Inc. 
- * Copyright (C) 2007, Andres Salomon - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book. - */ - -/* - * We are using the 32.768kHz input clock - it's the only one that has the - * ranges we find desirable. The following table lists the suitable - * divisors and the associated Hz, minimum interval and the maximum interval: - * - * Divisor Hz Min Delta (s) Max Delta (s) - * 1 32768 .00048828125 2.000 - * 2 16384 .0009765625 4.000 - * 4 8192 .001953125 8.000 - * 8 4096 .00390625 16.000 - * 16 2048 .0078125 32.000 - * 32 1024 .015625 64.000 - * 64 512 .03125 128.000 - * 128 256 .0625 256.000 - * 256 128 .125 512.000 - */ - -#include -#include -#include -#include - -#define MFGPT_DEFAULT_IRQ 7 - -static struct mfgpt_timer_t { - unsigned int avail:1; -} mfgpt_timers[MFGPT_MAX_TIMERS]; - -/* Selected from the table above */ - -#define MFGPT_DIVISOR 16 -#define MFGPT_SCALE 4 /* divisor = 2^(scale) */ -#define MFGPT_HZ (32768 / MFGPT_DIVISOR) -#define MFGPT_PERIODIC (MFGPT_HZ / HZ) - -/* Allow for disabling of MFGPTs */ -static int disable; -static int __init mfgpt_disable(char *s) -{ - disable = 1; - return 1; -} -__setup("nomfgpt", mfgpt_disable); - -/* Reset the MFGPT timers. This is required by some broken BIOSes which already - * do the same and leave the system in an unstable state. TinyBIOS 0.98 is - * affected at least (0.99 is OK with MFGPT workaround left to off). - */ -static int __init mfgpt_fix(char *s) -{ - u32 val, dummy; - - /* The following udocumented bit resets the MFGPT timers */ - val = 0xFF; dummy = 0; - wrmsr(MSR_MFGPT_SETUP, val, dummy); - return 1; -} -__setup("mfgptfix", mfgpt_fix); - -/* - * Check whether any MFGPTs are available for the kernel to use. In most - * cases, firmware that uses AMD's VSA code will claim all timers during - * bootup; we certainly don't want to take them if they're already in use. - * In other cases (such as with VSAless OpenFirmware), the system firmware - * leaves timers available for us to use. - */ - - -static int timers = -1; - -static void geode_mfgpt_detect(void) -{ - int i; - u16 val; - - timers = 0; - - if (disable) { - printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n"); - goto done; - } - - if (!geode_get_dev_base(GEODE_DEV_MFGPT)) { - printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n"); - goto done; - } - - for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - val = geode_mfgpt_read(i, MFGPT_REG_SETUP); - if (!(val & MFGPT_SETUP_SETUP)) { - mfgpt_timers[i].avail = 1; - timers++; - } - } - -done: - printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers); -} - -int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) -{ - u32 msr, mask, value, dummy; - int shift = (cmp == MFGPT_CMP1) ? 0 : 8; - - if (timer < 0 || timer >= MFGPT_MAX_TIMERS) - return -EIO; - - /* - * The register maps for these are described in sections 6.17.1.x of - * the AMD Geode CS5536 Companion Device Data Book. - */ - switch (event) { - case MFGPT_EVENT_RESET: - /* - * XXX: According to the docs, we cannot reset timers above - * 6; that is, resets for 7 and 8 will be ignored. Is this - * a problem? 
-dilinger - */ - msr = MSR_MFGPT_NR; - mask = 1 << (timer + 24); - break; - - case MFGPT_EVENT_NMI: - msr = MSR_MFGPT_NR; - mask = 1 << (timer + shift); - break; - - case MFGPT_EVENT_IRQ: - msr = MSR_MFGPT_IRQ; - mask = 1 << (timer + shift); - break; - - default: - return -EIO; - } - - rdmsr(msr, value, dummy); - - if (enable) - value |= mask; - else - value &= ~mask; - - wrmsr(msr, value, dummy); - return 0; -} -EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); - -int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable) -{ - u32 zsel, lpc, dummy; - int shift; - - if (timer < 0 || timer >= MFGPT_MAX_TIMERS) - return -EIO; - - /* - * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA - * is using the same CMP of the timer's Siamese twin, the IRQ is set to - * 2, and we mustn't use nor change it. - * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the - * IRQ of the 1st. This can only happen if forcing an IRQ, calling this - * with *irq==0 is safe. Currently there _are_ no 2 drivers. - */ - rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); - shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4; - if (((zsel >> shift) & 0xF) == 2) - return -EIO; - - /* Choose IRQ: if none supplied, keep IRQ already set or use default */ - if (!*irq) - *irq = (zsel >> shift) & 0xF; - if (!*irq) - *irq = MFGPT_DEFAULT_IRQ; - - /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */ - if (*irq < 1 || *irq == 2 || *irq > 15) - return -EIO; - rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy); - if (lpc & (1 << *irq)) - return -EIO; - - /* All chosen and checked - go for it */ - if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) - return -EIO; - if (enable) { - zsel = (zsel & ~(0xF << shift)) | (*irq << shift); - wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); - } - - return 0; -} - -static int mfgpt_get(int timer) -{ - mfgpt_timers[timer].avail = 0; - printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer); - return timer; -} - -int geode_mfgpt_alloc_timer(int timer, int domain) -{ - int i; - - if (timers == -1) { - /* timers haven't been detected yet */ - geode_mfgpt_detect(); - } - - if (!timers) - return -1; - - if (timer >= MFGPT_MAX_TIMERS) - return -1; - - if (timer < 0) { - /* Try to find an available timer */ - for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - if (mfgpt_timers[i].avail) - return mfgpt_get(i); - - if (i == 5 && domain == MFGPT_DOMAIN_WORKING) - break; - } - } else { - /* If they requested a specific timer, try to honor that */ - if (mfgpt_timers[timer].avail) - return mfgpt_get(timer); - } - - /* No timers available - too bad */ - return -1; -} -EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); - - -#ifdef CONFIG_GEODE_MFGPT_TIMER - -/* - * The MFPGT timers on the CS5536 provide us with suitable timers to use - * as clock event sources - not as good as a HPET or APIC, but certainly - * better than the PIT. This isn't a general purpose MFGPT driver, but - * a simplified one designed specifically to act as a clock event source. - * For full details about the MFGPT, please consult the CS5536 data sheet. 
- */ - -#include -#include - -static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; -static u16 mfgpt_event_clock; - -static int irq; -static int __init mfgpt_setup(char *str) -{ - get_option(&str, &irq); - return 1; -} -__setup("mfgpt_irq=", mfgpt_setup); - -static void mfgpt_disable_timer(u16 clock) -{ - /* avoid races by clearing CMP1 and CMP2 unconditionally */ - geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN | - MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2); -} - -static int mfgpt_next_event(unsigned long, struct clock_event_device *); -static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *); - -static struct clock_event_device mfgpt_clockevent = { - .name = "mfgpt-timer", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = mfgpt_set_mode, - .set_next_event = mfgpt_next_event, - .rating = 250, - .cpumask = cpu_all_mask, - .shift = 32 -}; - -static void mfgpt_start_timer(u16 delta) -{ - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta); - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); - - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, - MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2); -} - -static void mfgpt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - mfgpt_disable_timer(mfgpt_event_clock); - - if (mode == CLOCK_EVT_MODE_PERIODIC) - mfgpt_start_timer(MFGPT_PERIODIC); - - mfgpt_tick_mode = mode; -} - -static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt) -{ - mfgpt_start_timer(delta); - return 0; -} - -static irqreturn_t mfgpt_tick(int irq, void *dev_id) -{ - u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP); - - /* See if the interrupt was for us */ - if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1))) - return IRQ_NONE; - - /* Turn off the clock (and clear the event) */ - mfgpt_disable_timer(mfgpt_event_clock); - - if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN) - return IRQ_HANDLED; - - /* Clear the counter */ - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); - - /* Restart the clock in periodic mode */ - - if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) { - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, - MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2); - } - - mfgpt_clockevent.event_handler(&mfgpt_clockevent); - return IRQ_HANDLED; -} - -static struct irqaction mfgptirq = { - .handler = mfgpt_tick, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, - .name = "mfgpt-timer" -}; - -int __init mfgpt_timer_setup(void) -{ - int timer, ret; - u16 val; - - timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING); - if (timer < 0) { - printk(KERN_ERR - "mfgpt-timer: Could not allocate a MFPGT timer\n"); - return -ENODEV; - } - - mfgpt_event_clock = timer; - - /* Set up the IRQ on the MFGPT side */ - if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) { - printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); - return -EIO; - } - - /* And register it with the kernel */ - ret = setup_irq(irq, &mfgptirq); - - if (ret) { - printk(KERN_ERR - "mfgpt-timer: Unable to set up the interrupt.\n"); - goto err; - } - - /* Set the clock scale and enable the event mode for CMP2 */ - val = MFGPT_SCALE | (3 << 8); - - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val); - - /* Set up the clock event */ - mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, - mfgpt_clockevent.shift); - mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF, - &mfgpt_clockevent); - 
mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE, - &mfgpt_clockevent); - - printk(KERN_INFO - "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n", - timer, irq); - clockevents_register_device(&mfgpt_clockevent); - - return 0; - -err: - geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq); - printk(KERN_ERR - "mfgpt-timer: Unable to set up the MFGPT clock source\n"); - return -EIO; -} - -#endif -- cgit v1.2.2 From e7d2860b690d4f3bed6824757c540579638e3d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Mon, 14 Dec 2009 18:01:06 -0800 Subject: tree-wide: convert open calls to remove spaces to skip_spaces() lib function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes use of skip_spaces() defined in lib/string.c for removing leading spaces from strings all over the tree. It decreases lib.a code size by 47 bytes and reuses the function tree-wide: text data bss dec hex filename 64688 584 592 65864 10148 (TOTALS-BEFORE) 64641 584 592 65817 10119 (TOTALS-AFTER) Also, while at it, if we see (*str && isspace(*str)), we can be sure to remove the first condition (*str) as the second one (isspace(*str)) also evaluates to 0 whenever *str == 0, making it redundant. In other words, "a char equals zero is never a space". Julia Lawall tried the semantic patch (http://coccinelle.lip6.fr) below, and found occurrences of this pattern on 3 more files: drivers/leds/led-class.c drivers/leds/ledtrig-timer.c drivers/video/output.c @@ expression str; @@ ( // ignore skip_spaces cases while (*str && isspace(*str)) { \(str++;\|++str;\) } | - *str && isspace(*str) ) Signed-off-by: André Goddard Rosa Cc: Julia Lawall Cc: Martin Schwidefsky Cc: Jeff Dike Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Richard Purdie Cc: Neil Brown Cc: Kyle McMartin Cc: Henrique de Moraes Holschuh Cc: David Howells Cc: Cc: Samuel Ortiz Cc: Patrick McHardy Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/if.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 3c1b12d461d1..e006e56f699c 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #define LINE_SIZE 80 @@ -133,8 +134,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EINVAL; base = simple_strtoull(line + 5, &ptr, 0); - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); if (strncmp(ptr, "size=", 5)) return -EINVAL; @@ -142,14 +142,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); if (strncmp(ptr, "type=", 5)) return -EINVAL; - ptr += 5; - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr + 5); for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) -- cgit v1.2.2 From 23637568ad0c9b5ab0ad27d2f2f26d1e9282c527 Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Sun, 13 Dec 2009 16:04:38 -0600 Subject: x86: Fix kprobes build with non-gawk awk The instruction attribute table generator fails when run by mawk or original-awk: $ mawk -f arch/x86/tools/gen-insn-attr-x86.awk \ arch/x86/lib/x86-opcode-map.txt > /dev/null Semantic error at 240: Second IMM error $ echo $? 1 Line 240 contains "c8: ENTER Iw,Ib", which indicates that this instruction has two immediate operands, the second of which is one byte. The script loops through the immediate operands using a for loop. Unfortunately, there is no guarantee in awk that a for (variable in array) loop will return the indices in increasing order. Internally, both original-awk and mawk iterate over a hash table for this purpose, and both implementations happen to produce the index 2 before 1. The supposed second immediate operand is more than one byte wide, producing the error. So loop over the indices in increasing order instead. As a side-effect, with mawk this means the silly two-entry hash table never has to be built. Signed-off-by: Jonathan Nieder Acked-by Masami Hiramatsu Cc: Jim Keniston Cc: Frederic Weisbecker LKML-Reference: <20091213220437.GA27718@progeny.tock> Signed-off-by: Ingo Molnar --- arch/x86/tools/gen-insn-attr-x86.awk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index e34e92a28eb6..7a6850683c34 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -226,12 +226,12 @@ function add_flags(old,new) { } # convert operands to flags. 
-function convert_operands(opnd, i,imm,mod) +function convert_operands(count,opnd, i,j,imm,mod) { imm = null mod = null - for (i in opnd) { - i = opnd[i] + for (j = 1; j <= count; j++) { + i = opnd[j] if (match(i, imm_expr) == 1) { if (!imm_flag[i]) semantic_error("Unknown imm opnd: " i) @@ -282,8 +282,8 @@ function convert_operands(opnd, i,imm,mod) # parse one opcode if (match($i, opnd_expr)) { opnd = $i - split($(i++), opnds, ",") - flags = convert_operands(opnds) + count = split($(i++), opnds, ",") + flags = convert_operands(count, opnds) } if (match($i, ext_expr)) ext = $(i++) -- cgit v1.2.2 From 0b962d473af32ec334df271b54ff4973cb2b4c73 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 15 Dec 2009 15:13:07 -0800 Subject: x86, msr/cpuid: Register enough minors for the MSR and CPUID drivers register_chrdev() hardcodes registering 256 minors, presumably to avoid breaking old drivers. However, we need to register enough minors so that we have all possible CPUs. checkpatch warns on this patch, but the patch is correct: NR_CPUS here is a static *upper bound* on the *maximum CPU index* (not *number of CPUs!*) and that is what we want. Reported-and-tested-by: Russ Anderson Cc: Tejun Heo Cc: Alan Cox Cc: Takashi Iwai Cc: Alexander Viro Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpuid.c | 5 +++-- arch/x86/kernel/msr.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7ef24a796992..cb27fd6136c9 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -187,7 +187,8 @@ static int __init cpuid_init(void) int i, err = 0; i = 0; - if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { + if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, + "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", CPUID_MAJOR); err = -EBUSY; @@ -216,7 +217,7 @@ out_class: } class_destroy(cpuid_class); out_chrdev: - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); out: return err; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 572b07eee3f4..4bd93c9b2b27 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -246,7 +246,7 @@ static int __init msr_init(void) int i, err = 0; i = 0; - if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { printk(KERN_ERR "msr: unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; @@ -274,7 +274,7 @@ out_class: msr_device_destroy(i); class_destroy(msr_class); out_chrdev: - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); out: return err; } -- cgit v1.2.2 From 5df974009fe513c664303de24725ea0f8b47f12e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 16 Dec 2009 13:48:04 +0800 Subject: x86: Add IA32_TSC_AUX MSR and use it Clean up write_tsc() and write_tscp_aux() by replacing hardcoded values. No change in functionality. 
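[Editor's note: a minimal user-space sketch, not part of the patch, showing what IA32_TSC_AUX (MSR 0xc0000103) is for: whatever a privileged writer stores there via write_rdtscp_aux() comes back in ECX on every RDTSCP. It assumes an x86 CPU with RDTSCP support and GCC-style inline asm; under Linux the register typically holds the CPU (and node) number, which is how a process can cheaply ask which CPU it is running on.]

#include <stdint.h>
#include <stdio.h>

/* RDTSCP: EDX:EAX = time-stamp counter, ECX = IA32_TSC_AUX */
static inline uint64_t rdtscp(uint32_t *aux)
{
	uint32_t lo, hi;
	__asm__ volatile("rdtscp" : "=a"(lo), "=d"(hi), "=c"(*aux));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	uint32_t aux;
	uint64_t tsc = rdtscp(&aux);
	printf("tsc=%llu tsc_aux=%u\n", (unsigned long long)tsc, aux);
	return 0;
}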
Signed-off-by: Sheng Yang Cc: Avi Kivity Cc: Marcelo Tosatti LKML-Reference: <1260942485-19156-4-git-send-email-sheng@linux.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/msr.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4ffe09b2ad75..ac98d2914ebf 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -12,6 +12,7 @@ #define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ #define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ #define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ +#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ /* EFER bits: */ #define _EFER_SCE 0 /* SYSCALL/SYSRET */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 2d228fc9b4b7..cf985aa00660 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -240,9 +240,9 @@ do { \ #define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) -#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2)) +#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) -#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) +#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0) struct msr *msrs_alloc(void); void msrs_free(struct msr *msrs); -- cgit v1.2.2 From 7f38551fc3ff0e17a38d6f3f0f8831380a88f3cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:20 -0800 Subject: ptrace: x86: implement user_single_step_siginfo() Suggested by Roland. Implement user_single_step_siginfo() for x86. Extract this code from send_sigtrap(). Since x86 calls tracehook_report_syscall_exit(step => 0) the new helper is not used yet. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/ptrace.h | 2 ++ arch/x86/kernel/ptrace.c | 30 +++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 3d11fd0f44c5..9d369f680321 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -292,6 +292,8 @@ extern void user_enable_block_step(struct task_struct *); #define arch_has_block_step() (boot_cpu_data.x86 >= 6) #endif +#define ARCH_HAS_USER_SINGLE_STEP_INFO + struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7079ddaf0731..77b60085a810 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1676,21 +1676,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code) +static void fill_sigtrap_info(struct task_struct *tsk, + struct pt_regs *regs, + int error_code, int si_code, + struct siginfo *info) { - struct siginfo info; - tsk->thread.trap_no = 1; tsk->thread.error_code = error_code; - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = si_code; + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = si_code; + info->si_addr = user_mode_vm(regs) ? 
(void __user *)regs->ip : NULL; +} + +void user_single_step_siginfo(struct task_struct *tsk, + struct pt_regs *regs, + struct siginfo *info) +{ + fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info); +} - /* User-mode ip? */ - info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) +{ + struct siginfo info; + fill_sigtrap_info(tsk, regs, error_code, si_code, &info); /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); } -- cgit v1.2.2 From d51965037325e51f6cd68583413243c3573e47b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:21 -0800 Subject: ptrace: x86: change syscall_trace_leave() to rely on tracehook when stepping Suggested by Roland. Unlike powerpc, x86 always calls tracehook_report_syscall_exit(step) with step = 0, and sends the trap by hand. This results in an unnecessary SIGTRAP when PTRACE_SINGLESTEP follows the syscall-exit stop. Change syscall_trace_leave() to pass the correct "step" argument to tracehook and remove the send_sigtrap() logic. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 77b60085a810..2779321046bd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1767,29 +1767,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) asmregparm void syscall_trace_leave(struct pt_regs *regs) { + bool step; + if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); - /* * If TIF_SYSCALL_EMU is set, we only get here because of * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). * We already reported this syscall instruction in - * syscall_trace_enter(), so don't do any more now. - */ - if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) - return; - - /* - * If we are single-stepping, synthesize a trap to follow the - * system call instruction. + * syscall_trace_enter(). */ - if (test_thread_flag(TIF_SINGLESTEP) && - tracehook_consider_fatal_signal(current, SIGTRAP)) - send_sigtrap(current, regs, 0, TRAP_BRKPT); + step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && + !test_thread_flag(TIF_SYSCALL_EMU); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); } -- cgit v1.2.2 From 698ba7b5a3a7be772922340fade365c675b8243f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Dec 2009 16:47:37 -0800 Subject: elf: kill USE_ELF_CORE_DUMP Currently all architectures but microblaze unconditionally define USE_ELF_CORE_DUMP. The microblaze omission seems like an error to me, so let's kill this ifdef and make sure we are the same everywhere. 
Signed-off-by: Christoph Hellwig Acked-by: Hugh Dickins Cc: Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/elf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 8a024babe5e6..b4501ee223ad 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -239,7 +239,6 @@ extern int force_personality32; #endif /* !CONFIG_X86_32 */ #define CORE_DUMP_USE_REGSET -#define USE_ELF_CORE_DUMP #define ELF_EXEC_PAGESIZE 4096 /* This is the location that an ET_DYN program is loaded if exec'ed. Typical -- cgit v1.2.2 From ac2b3e67dd59b8c6ef8c199641444c6ea03535a6 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 15 Dec 2009 16:47:43 -0800 Subject: dma-mapping: fix off-by-one error in dma_capable() dma_mask is, when interpreted as an address, the last valid byte, and hence the comparison must also be done using the last valid byte of the buffer in question. Also fix the open-coded instances in lib/swiotlb.c. Signed-off-by: Jan Beulich Cc: FUJITA Tomonori Cc: Becky Bruce Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 0f6c02f3b7d4..ac91eed21061 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -67,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) if (!dev->dma_mask) return 0; - return addr + size <= *dev->dma_mask; + return addr + size - 1 <= *dev->dma_mask; } static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) -- cgit v1.2.2 From 729d69e6995fc4dea8ff70df256a7d4034a3d21d Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 15 Dec 2009 16:47:52 -0800 Subject: x86: uv: introduce a means to translate from gpa -> socket_paddr The UV BIOS has been updated to implement some of our interface functionality differently than originally expected. These patches update the kernel to the bios implementation and include a few minor bug fixes for problems which had prevented us from doing significant testing on real hardware. This patch: For SGI UV systems, translate from a global physical address back to a socket physical address. This does nothing to ensure the socket physical address is actually addressable by the kernel. That is the responsibility of the user of the function. 
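[Editor's note: a usage sketch of the helper this patch adds; the implementation appears in the diff below. The round trip through uv_gpa() is an assumption based on the surrounding uv_hub.h helpers, not something this patch itself does.]

void *v = kmalloc(sz, GFP_KERNEL);
unsigned long gpa = uv_gpa(v);                      /* socket paddr -> global address */
unsigned long paddr = uv_gpa_to_soc_phys_ram(gpa);  /* global address -> socket paddr */
/*
 * The result is a socket-local physical address; whether the kernel
 * can actually address it is, as the commit message says, the
 * caller's responsibility.
 */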
Signed-off-by: Robin Holt Cc: Jack Steiner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uv/uv_hub.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index d1414af98559..0cc955f16aec 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -232,6 +232,19 @@ static inline unsigned long uv_gpa(void *v) return uv_soc_phys_ram_to_gpa(__pa(v)); } +/* UV global physical address --> socket phys RAM */ +static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) +{ + unsigned long paddr = gpa & uv_hub_info->gpa_mask; + unsigned long remap_base = uv_hub_info->lowmem_remap_base; + unsigned long remap_top = uv_hub_info->lowmem_remap_top; + + if (paddr >= remap_base && paddr < remap_base + remap_top) + paddr -= remap_base; + return paddr; +} + + /* gnode -> pnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { -- cgit v1.2.2 From fae419f2abd15ab7d1cd1413e6683a276a4e14e2 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 15 Dec 2009 16:47:54 -0800 Subject: x86: uv: introduce uv_gpa_is_mmr Provide a mechanism for determining if a global physical address is pointing to a UV hub MMR. Signed-off-by: Robin Holt Cc: Jack Steiner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uv/uv_hub.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 0cc955f16aec..8f1332bbfd72 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -232,6 +232,13 @@ static inline unsigned long uv_gpa(void *v) return uv_soc_phys_ram_to_gpa(__pa(v)); } +/* Top two bits indicate the requested address is in MMR space. */ +static inline int +uv_gpa_in_mmr_space(unsigned long gpa) +{ + return (gpa >> 62) == 0x3UL; +} + /* UV global physical address --> socket phys RAM */ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) { -- cgit v1.2.2 From c2c9f115741453715d6b4da1cd2de65af8c7ad86 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 15 Dec 2009 16:47:56 -0800 Subject: x86: uv: update XPC to handle updated BIOS interface The UV BIOS has moved the location of some of their pointers to the "partition reserved page" from memory into a uv hub MMR. The GRU does not support bcopy operations from MMR space so we need to special case the MMR addresses using VLOAD operations. Additionally, the BIOS call for registering a message queue watchlist has removed the 'blade' value and eliminated the structure that was being passed in. This is also reflected in this patch. 
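[Editor's note: a sketch of what the watchlist interface change means for a caller such as XPC — the blade number and the packed union uv_watchlist_u go away, and the queue size is passed directly. The caller shape here is illustrative; the real signatures are in the diff below.]

/* before: a blade id plus the size, packed into union uv_watchlist_u */
ret = uv_bios_mq_watchlist_alloc(uv_cpu_to_blade_id(cpu),
				 uv_gpa(mq), mq_size, &mmr_offset);

/* after: the BIOS no longer needs the blade; pass the size directly */
ret = uv_bios_mq_watchlist_alloc(uv_gpa(mq), mq_size, &mmr_offset);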
Signed-off-by: Robin Holt Cc: Jack Steiner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uv/bios.h | 11 +---------- arch/x86/kernel/bios_uv.c | 8 ++------ 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 7ed17ff502b9..2751f3075d8b 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -76,15 +76,6 @@ union partition_info_u { }; }; -union uv_watchlist_u { - u64 val; - struct { - u64 blade : 16, - size : 32, - filler : 16; - }; -}; - enum uv_memprotect { UV_MEMPROT_RESTRICT_ACCESS, UV_MEMPROT_ALLOW_AMO, @@ -100,7 +91,7 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); -extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, +extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 63a88e1f987d..b0206a211b09 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -101,21 +101,17 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, } int -uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, +uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, unsigned long *intr_mmr_offset) { - union uv_watchlist_u size_blade; u64 watchlist; s64 ret; - size_blade.size = mq_size; - size_blade.blade = blade; - /* * bios returns watchlist number or negative error number. */ ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, - size_blade.val, (u64)intr_mmr_offset, + mq_size, (u64)intr_mmr_offset, (u64)&watchlist, 0); if (ret < BIOS_STATUS_SUCCESS) return ret; -- cgit v1.2.2 From 56abcf24ff993291b20efd6e3402cd3d12f5cee2 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 15 Dec 2009 16:48:20 -0800 Subject: gru: function to generate chipset IPI values Create a function to generate the value that is written to the UV hub MMR to cause an IPI interrupt to be sent. The function will be used in the GRU message queue error recovery code that sends IPIs to nodes in remote partitions. Signed-off-by: Jack Steiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uv/uv_hub.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 8f1332bbfd72..811bfabc80b7 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -172,6 +172,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024) #define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) +#define UV_GLOBAL_GRU_MMR_BASE 0x4000000 + #define UV_GLOBAL_MMR32_PNODE_SHIFT 15 #define UV_GLOBAL_MMR64_PNODE_SHIFT 26 @@ -327,6 +329,15 @@ static inline unsigned long uv_read_global_mmr64(int pnode, return readq(uv_global_mmr64_address(pnode, offset)); } +/* + * Global MMR space addresses when referenced by the GRU. (GRU does + * NOT use socket addressing). 
+ */ +static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) +{ + return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val); +} + /* * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. @@ -454,6 +465,14 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) } } +static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) +{ + return (1UL << UVH_IPI_INT_SEND_SHFT) | + ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | + (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); +} + static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) { unsigned long val; @@ -462,10 +481,7 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) if (vector == NMI_VECTOR) dmode = dest_NMI; - val = (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | - (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | - (vector << UVH_IPI_INT_VECTOR_SHFT); + val = uv_hub_ipi_value(apicid, vector, dmode); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } -- cgit v1.2.2 From a66022c457755b5eef61e30866114679c95e1f54 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 15 Dec 2009 16:48:28 -0800 Subject: iommu-helper: use bitmap library Use bitmap library and kill some unused iommu helper functions. 1. s/iommu_area_free/bitmap_clear/ 2. s/iommu_area_reserve/bitmap_set/ 3. Use bitmap_find_next_zero_area instead of find_next_zero_area This cannot be simple substitution because find_next_zero_area doesn't check the last bit of the limit in bitmap 4. Remove iommu_area_free, iommu_area_reserve, and find_next_zero_area Signed-off-by: Akinobu Mita Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: FUJITA Tomonori Cc: Joerg Roedel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_iommu.c | 4 ++-- arch/x86/kernel/pci-calgary_64.c | 6 +++--- arch/x86/kernel/pci-gart_64.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b990b5cc9541..23824fef789c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -1162,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; - iommu_area_free(range->bitmap, address, pages); + bitmap_clear(range->bitmap, address, pages); } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index c563e4c8ff39..2bbde6078143 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include @@ -212,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - iommu_area_reserve(tbl->it_map, index, npages); + bitmap_set(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } @@ -303,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - iommu_area_free(tbl->it_map, entry, npages); + bitmap_clear(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 56c0e730d3fe..34de53b46f87 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -126,7 +126,7 @@ static void free_iommu(unsigned long offset, int size) unsigned long flags; spin_lock_irqsave(&iommu_bitmap_lock, flags); - iommu_area_free(iommu_gart_bitmap, offset, size); + bitmap_clear(iommu_gart_bitmap, offset, size); if (offset >= next_bit) next_bit = offset + size; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -792,7 +792,7 @@ int __init gart_iommu_init(void) * Out of IOMMU space handling. * Reserve some invalid pages at the beginning of the GART. 
*/ - iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES); pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); -- cgit v1.2.2 From 853b3da10d617f08340e5fe569c99e7b54f2a568 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Dec 2009 00:34:13 -0500 Subject: sanitize do_pipe_flags() callers in arch * hpux_pipe() - no need to take BKL * sys32_pipe() in arch/x86/ia32 and xtensa_pipe() in arch/xtensa - no need at all, since both functions are open-coded sys_pipe() Signed-off-by: Al Viro --- arch/x86/include/asm/sys_ia32.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 4a5a089e1c62..d5f69045c100 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -30,7 +30,6 @@ struct mmap_arg_struct; asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); -asmlinkage long sys32_pipe(int __user *); struct sigaction32; struct old_sigaction32; asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *, -- cgit v1.2.2 From 5714868812b563ba8816c1d974f4f07c76941c30 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 15 Dec 2009 10:19:50 +0900 Subject: PCI: fix section mismatch on update_res() Remark update_res from __init to __devinit as it is called also from __devinit functions. This patch removes the following warning message: WARNING: vmlinux.o(.devinit.text+0x774a): Section mismatch in reference from the function pci_root_bus_res() to the function .init.text:update_res() The function __devinit pci_root_bus_res() references a function __init update_res(). If update_res is only used by pci_root_bus_res then annotate update_res with a matching annotation. Signed-off-by: Jiri Slaby Cc: Aristeu Sergio Cc: Jesse Barnes Cc: linux-pci@vger.kernel.org Cc: x86@kernel.org Signed-off-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- arch/x86/pci/bus_numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 145df00e0387..f939d603adfa 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -51,7 +51,7 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) } } -void __init update_res(struct pci_root_info *info, size_t start, +void __devinit update_res(struct pci_root_info *info, size_t start, size_t end, unsigned long flags, int merge) { int i; -- cgit v1.2.2 From 9d260ebc09a0ad6b5c73e17676df42c7bc75ff64 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 16 Dec 2009 15:43:55 +0100 Subject: x86, amd: Get multi-node CPU info from NodeId MSR instead of PCI config space Use NodeId MSR to get NodeId and number of nodes per processor. Signed-off-by: Andreas Herrmann LKML-Reference: <20091216144355.GB28798@alberich.amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/amd.c | 53 +++++++++++---------------------------- 3 files changed, 17 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 613700f27a4a..637e1ec963c3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -153,6 +153,7 @@ #define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ /* * Auxiliary flags: Linux defined - For features scattered in various diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ac98d2914ebf..1cd58cdbc03f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -124,6 +124,7 @@ #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff #define FAM10H_MMIO_CONF_BASE_SHIFT 20 +#define MSR_FAM10H_NODE_ID 0xc001100c /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8dc3ea145c97..e485825130d2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid) /* * Fixup core topology information for AMD multi-node processors. - * Assumption 1: Number of cores in each internal node is the same. - * Assumption 2: Mixed systems with both single-node and dual-node - * processors are not supported. + * Assumption: Number of cores in each internal node is the same. */ #ifdef CONFIG_X86_HT static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) { -#ifdef CONFIG_PCI - u32 t, cpn; - u8 n, n_id; + unsigned long long value; + u32 nodes, cores_per_node; int cpu = smp_processor_id(); + if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) + return; + /* fixup topology information only once for a core */ if (cpu_has(c, X86_FEATURE_AMD_DCM)) return; - /* check for multi-node processor on boot cpu */ - t = read_pci_config(0, 24, 3, 0xe8); - if (!(t & (1 << 29))) + rdmsrl(MSR_FAM10H_NODE_ID, value); + + nodes = ((value >> 3) & 7) + 1; + if (nodes == 1) return; set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; - /* cores per node: each internal node has half the number of cores */ - cpn = c->x86_max_cores >> 1; + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = value & 7; - /* even-numbered NB_id of this dual-node processor */ - n = c->phys_proc_id << 1; - - /* - * determine internal node id and assign cores fifty-fifty to - * each node of the dual-node processor - */ - t = read_pci_config(0, 24 + n, 3, 0xe8); - n = (t>>30) & 0x3; - if (n == 0) { - if (c->cpu_core_id < cpn) - n_id = 0; - else - n_id = 1; - } else { - if (c->cpu_core_id < cpn) - n_id = 1; - else - n_id = 0; - } - - /* compute entire NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id; - - /* fixup core id to be in range from 0 to cpn */ - c->cpu_core_id = c->cpu_core_id % cpn; -#endif + /* fixup core id to be in range from 0 to (cores_per_node - 1) */ + c->cpu_core_id = c->cpu_core_id % cores_per_node; } #endif -- cgit v1.2.2 From 6ede31e03084ee084bcee073ef3d1136f68d0906 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 17 Dec 2009 00:16:25 +0100 Subject: x86, 
msr: msrs_alloc/free for CONFIG_SMP=n Randy Dunlap reported the following build error: "When CONFIG_SMP=n, CONFIG_X86_MSR=m: ERROR: "msrs_free" [drivers/edac/amd64_edac_mod.ko] undefined! ERROR: "msrs_alloc" [drivers/edac/amd64_edac_mod.ko] undefined!" This is due to the fact that is conditioned on CONFIG_SMP and in the UP case we have only the stubs in the header. Fork off SMP functionality into a new file (msr-smp.c) and build msrs_{alloc,free} unconditionally. Reported-by: Randy Dunlap Cc: H. Peter Anvin Signed-off-by: Borislav Petkov LKML-Reference: <20091216231625.GD27228@liondog.tnic> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 12 +++ arch/x86/lib/Makefile | 4 +- arch/x86/lib/msr-smp.c | 204 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/lib/msr.c | 213 --------------------------------------------- 4 files changed, 218 insertions(+), 215 deletions(-) create mode 100644 arch/x86/lib/msr-smp.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index cf985aa00660..c5bc4c2d33f5 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -27,6 +27,18 @@ struct msr { }; }; +struct msr_info { + u32 msr_no; + struct msr reg; + struct msr *msrs; + int err; +}; + +struct msr_regs_info { + u32 *regs; + int err; +}; + static inline unsigned long long native_read_tscp(unsigned int *aux) { unsigned long low, high; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a2d6472895fb..706be8bf967b 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -14,7 +14,7 @@ $(obj)/inat.o: $(obj)/inat-tables.c clean-files := inat-tables.c -obj-$(CONFIG_SMP) := msr.o +obj-$(CONFIG_SMP) += msr-smp.o lib-y := delay.o lib-y += thunk_$(BITS).o @@ -22,7 +22,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-y += insn.o inat.o -obj-y += msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c new file mode 100644 index 000000000000..a6b1b86d2253 --- /dev/null +++ b/arch/x86/lib/msr-smp.c @@ -0,0 +1,204 @@ +#include +#include +#include +#include + +static void __rdmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = per_cpu_ptr(rv->msrs, this_cpu); + else + reg = &rv->reg; + + rdmsr(rv->msr_no, reg->l, reg->h); +} + +static void __wrmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = per_cpu_ptr(rv->msrs, this_cpu); + else + reg = &rv->reg; + + wrmsr(rv->msr_no, reg->l, reg->h); +} + +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err; +} +EXPORT_SYMBOL(rdmsr_on_cpu); + +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} +EXPORT_SYMBOL(wrmsr_on_cpu); + +static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, + struct msr *msrs, + void (*msr_func) (void *info)) +{ + struct msr_info rv; + int this_cpu; + + memset(&rv, 0, sizeof(rv)); + + rv.msrs = msrs; 
+ rv.msr_no = msr_no; + + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + msr_func(&rv); + + smp_call_function_many(mask, msr_func, &rv, 1); + put_cpu(); +} + +/* rdmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); +} +EXPORT_SYMBOL(rdmsr_on_cpus); + +/* + * wrmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); +} +EXPORT_SYMBOL(wrmsr_on_cpus); + +/* These "safe" variants are slower and should be used when the target MSR + may not actually exist. */ +static void __rdmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); +} + +static void __wrmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); +} + +int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_on_cpu); + +int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_on_cpu); + +/* + * These variants are significantly slower, but allows control over + * the entire 32-bit GPR set. + */ +static void __rdmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = rdmsr_safe_regs(rv->regs); +} + +static void __wrmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = wrmsr_safe_regs(rv->regs); +} + +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); + +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); + + return err ? 
err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 872834177937..8f8eebdca7d4 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -1,123 +1,7 @@ #include #include -#include #include -struct msr_info { - u32 msr_no; - struct msr reg; - struct msr *msrs; - int err; -}; - -static void __rdmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - struct msr *reg; - int this_cpu = raw_smp_processor_id(); - - if (rv->msrs) - reg = per_cpu_ptr(rv->msrs, this_cpu); - else - reg = &rv->reg; - - rdmsr(rv->msr_no, reg->l, reg->h); -} - -static void __wrmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - struct msr *reg; - int this_cpu = raw_smp_processor_id(); - - if (rv->msrs) - reg = per_cpu_ptr(rv->msrs, this_cpu); - else - reg = &rv->reg; - - wrmsr(rv->msr_no, reg->l, reg->h); -} - -int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); - *l = rv.reg.l; - *h = rv.reg.h; - - return err; -} -EXPORT_SYMBOL(rdmsr_on_cpu); - -int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - rv.reg.l = l; - rv.reg.h = h; - err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); - - return err; -} -EXPORT_SYMBOL(wrmsr_on_cpu); - -static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, - struct msr *msrs, - void (*msr_func) (void *info)) -{ - struct msr_info rv; - int this_cpu; - - memset(&rv, 0, sizeof(rv)); - - rv.msrs = msrs; - rv.msr_no = msr_no; - - this_cpu = get_cpu(); - - if (cpumask_test_cpu(this_cpu, mask)) - msr_func(&rv); - - smp_call_function_many(mask, msr_func, &rv, 1); - put_cpu(); -} - -/* rdmsr on a bunch of CPUs - * - * @mask: which CPUs - * @msr_no: which MSR - * @msrs: array of MSR values - * - */ -void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) -{ - __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); -} -EXPORT_SYMBOL(rdmsr_on_cpus); - -/* - * wrmsr on a bunch of CPUs - * - * @mask: which CPUs - * @msr_no: which MSR - * @msrs: array of MSR values - * - */ -void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) -{ - __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); -} -EXPORT_SYMBOL(wrmsr_on_cpus); - struct msr *msrs_alloc(void) { struct msr *msrs = NULL; @@ -137,100 +21,3 @@ void msrs_free(struct msr *msrs) free_percpu(msrs); } EXPORT_SYMBOL(msrs_free); - -/* These "safe" variants are slower and should be used when the target MSR - may not actually exist. */ -static void __rdmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); -} - -static void __wrmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); -} - -int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); - *l = rv.reg.l; - *h = rv.reg.h; - - return err ? 
err : rv.err; -} -EXPORT_SYMBOL(rdmsr_safe_on_cpu); - -int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - rv.reg.l = l; - rv.reg.h = h; - err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); - - return err ? err : rv.err; -} -EXPORT_SYMBOL(wrmsr_safe_on_cpu); - -/* - * These variants are significantly slower, but allows control over - * the entire 32-bit GPR set. - */ -struct msr_regs_info { - u32 *regs; - int err; -}; - -static void __rdmsr_safe_regs_on_cpu(void *info) -{ - struct msr_regs_info *rv = info; - - rv->err = rdmsr_safe_regs(rv->regs); -} - -static void __wrmsr_safe_regs_on_cpu(void *info) -{ - struct msr_regs_info *rv = info; - - rv->err = wrmsr_safe_regs(rv->regs); -} - -int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) -{ - int err; - struct msr_regs_info rv; - - rv.regs = regs; - rv.err = -EIO; - err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); - - return err ? err : rv.err; -} -EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); - -int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) -{ - int err; - struct msr_regs_info rv; - - rv.regs = regs; - rv.err = -EIO; - err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); - - return err ? err : rv.err; -} -EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); -- cgit v1.2.2 From 45a94d7cd45ed991914011919e7d40eb6d2546d1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 16 Dec 2009 16:25:42 -0800 Subject: x86, cpuid: Add "volatile" to asm in native_cpuid() xsave_cntxt_init() does something like: cpuid(0xd, ..); // find out what features FP/SSE/.. etc are supported xsetbv(); // enable the features known to OS cpuid(0xd, ..); // find out the size of the context for features enabled Depending on what features get enabled in xsetbv(), value of the cpuid.eax=0xd.ecx=0.ebx changes correspondingly (representing the size of the context that is enabled). As we don't have volatile keyword for native_cpuid(), gcc 4.1.2 optimizes away the second cpuid and the kernel continues to use the cpuid information obtained before xsetbv(), ultimately leading to kernel crash on processors supporting more state than the legacy FP/SSE. Add "volatile" for native_cpuid(). Signed-off-by: Suresh Siddha LKML-Reference: <1261009542.2745.55.camel@sbs-t61.sc.intel.com> Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6f8ec1c37e0a..fc801bab1b3b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -181,7 +181,7 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ - asm("cpuid" + asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), -- cgit v1.2.2 From 329962503692b42d8088f31584e42d52db179d52 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 15 Dec 2009 17:59:02 -0800 Subject: x86: Fix checking of SRAT when node 0 ram is not from 0 Found one system that boot from socket1 instead of socket0, SRAT get rejected... 
[ 0.000000] SRAT: Node 1 PXM 0 0-a0000 [ 0.000000] SRAT: Node 1 PXM 0 100000-80000000 [ 0.000000] SRAT: Node 1 PXM 0 100000000-2080000000 [ 0.000000] SRAT: Node 0 PXM 1 2080000000-4080000000 [ 0.000000] SRAT: Node 2 PXM 2 4080000000-6080000000 [ 0.000000] SRAT: Node 3 PXM 3 6080000000-8080000000 [ 0.000000] SRAT: Node 4 PXM 4 8080000000-a080000000 [ 0.000000] SRAT: Node 5 PXM 5 a080000000-c080000000 [ 0.000000] SRAT: Node 6 PXM 6 c080000000-e080000000 [ 0.000000] SRAT: Node 7 PXM 7 e080000000-10080000000 ... [ 0.000000] NUMA: Allocated memnodemap from 500000 - 701040 [ 0.000000] NUMA: Using 20 for the hash shift. [ 0.000000] Adding active range (0, 0x2080000, 0x4080000) 0 entries of 3200 used [ 0.000000] Adding active range (1, 0x0, 0x96) 1 entries of 3200 used [ 0.000000] Adding active range (1, 0x100, 0x7f750) 2 entries of 3200 used [ 0.000000] Adding active range (1, 0x100000, 0x2080000) 3 entries of 3200 used [ 0.000000] Adding active range (2, 0x4080000, 0x6080000) 4 entries of 3200 used [ 0.000000] Adding active range (3, 0x6080000, 0x8080000) 5 entries of 3200 used [ 0.000000] Adding active range (4, 0x8080000, 0xa080000) 6 entries of 3200 used [ 0.000000] Adding active range (5, 0xa080000, 0xc080000) 7 entries of 3200 used [ 0.000000] Adding active range (6, 0xc080000, 0xe080000) 8 entries of 3200 used [ 0.000000] Adding active range (7, 0xe080000, 0x10080000) 9 entries of 3200 used [ 0.000000] SRAT: PXMs only cover 917504MB of your 1048566MB e820 RAM. Not used. [ 0.000000] SRAT: SRAT not used. The early_node_map is not sorted, because node0, which has a non-zero start, comes first; so sort it right away after all regions are registered. This also fixes the regression introduced by 8716273c (x86: Export srat physical topology). -v2: make it more solid to handle cross-node cases like node0 [0,4g), [8,12g) and node1 [4g, 8g), [12g, 16g) -v3: update comments. Reported-and-tested-by: Jens Axboe Signed-off-by: Yinghai Lu LKML-Reference: <4B2579D2.3010201@kernel.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/srat_32.c | 2 ++ arch/x86/mm/srat_64.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 6f8aa33031c7..9324f13492d5 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -267,6 +267,8 @@ int __init get_memcfg_from_srat(void) e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } + /* for out of order entries in SRAT */ + sort_node_map(); for_each_online_node(nid) { unsigned long start = node_start_pfn[nid]; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index d89075489664..a27124185fc1 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -317,7 +317,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) unsigned long s = nodes[i].start >> PAGE_SHIFT; unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; - pxmram -= absent_pages_in_range(s, e); + pxmram -= __absent_pages_in_range(i, s, e); if ((long)pxmram < 0) pxmram = 0; } @@ -373,6 +373,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) for_each_node_mask(i, nodes_parsed) e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); + /* for out of order entries in SRAT */ + sort_node_map(); if (!nodes_cover_memory(nodes)) { bad_srat(); return -1; -- cgit v1.2.2 From 6a1e008a0915f502eb026fb995ea3e49d5b017f7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 15 Dec 2009 17:59:03 -0800 Subject: x86: Increase MAX_EARLY_RES; insufficient on 32-bit NUMA Due to recent changes wakeup and mptable, we run out of early reservations on 32-bit NUMA. Thus, adjust the available number. Signed-off-by: Yinghai Lu LKML-Reference: <4B22D754.2020706@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f50447d961c0..05ed7ab2ca48 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -724,7 +724,7 @@ core_initcall(e820_mark_nvs_memory); /* * Early reserved memory areas. */ -#define MAX_EARLY_RES 20 +#define MAX_EARLY_RES 32 struct early_res { u64 start, end; -- cgit v1.2.2 From a4636818f8e0991f32d9528f39cf4f3d6a7d30a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 17 Dec 2009 11:43:29 -0600 Subject: cpumask: rename tsk_cpumask to tsk_cpus_allowed Noone uses this wrapper yet, and Ingo asked that it be kept consistent with current task_struct usage. 
(One user crept in via linux-next: fixed) Signed-off-by: Rusty Russell Cc: Tejun Heo --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a9df9441a9a2..f125e5c551c0 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1136,7 +1136,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) return -ENOMEM; - cpumask_copy(oldmask, tsk_cpumask(current)); + cpumask_copy(oldmask, tsk_cpus_allowed(current)); set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); if (smp_processor_id() != pol->cpu) { -- cgit v1.2.2 From 61c1917f47f73c968e92d04d15370b1dc3ec4592 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Dec 2009 05:40:33 +0100 Subject: perf events, x86/stacktrace: Make stack walking optional The current print_context_stack helper that does the stack walking job is good for usual stacktraces as it walks through all the stack and reports even addresses that look unreliable, which is nice when we don't have frame pointers for example. But we have users like perf that only require reliable stacktraces, and those may want a more adapted stack walker, so lets make this function a callback in stacktrace_ops that users can tune for their needs. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1261024834-5336-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 18 ++++++++++++++++++ arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/dumpstack.c | 9 +++++---- arch/x86/kernel/dumpstack.h | 6 ------ arch/x86/kernel/dumpstack_32.c | 2 +- arch/x86/kernel/dumpstack_64.c | 4 ++-- arch/x86/kernel/stacktrace.c | 18 ++++++++++-------- arch/x86/oprofile/backtrace.c | 9 +++++---- 8 files changed, 42 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index cf86a5e73815..6c75151a3cca 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -5,6 +5,23 @@ extern int kstack_depth_to_print; int x86_is_stack_id(int id, char *name); +struct thread_info; +struct stacktrace_ops; + +typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo, + unsigned long *stack, + unsigned long bp, + const struct stacktrace_ops *ops, + void *data, + unsigned long *end, + int *graph); + +extern unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); + /* Generic stack tracer with callbacks */ struct stacktrace_ops { @@ -14,6 +31,7 @@ struct stacktrace_ops { void (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); + walk_stack_t walk_stack; }; void dump_trace(struct task_struct *tsk, struct pt_regs *regs, diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 45506d5dd8df..d3802ee5a416 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2336,6 +2336,7 @@ static const struct stacktrace_ops backtrace_ops = { .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, + .walk_stack = 
print_context_stack, }; #include "../dumpstack.h" diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0a0aa1cec8f1..8aaa119b7cad 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -141,10 +141,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, + .walk_stack = print_context_stack, }; void diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 81086c227ab7..4fd1420faffa 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -14,12 +14,6 @@ #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif -extern unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); - extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0ed4c7abb62..ae775ca47b25 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -58,7 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index b13af53883aa..0ad9597073f5 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -188,8 +188,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, id) < 0) break; - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, + data, estack_end, &graph); ops->stack(data, ""); /* * We link to the next stack via the diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index c3eb207181fe..922eefbb3f6c 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops save_stack_ops = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address, + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address, + .walk_stack = print_context_stack, }; static const struct stacktrace_ops save_stack_ops_nosched = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address_nosched, + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address_nosched, + .walk_stack = print_context_stack, }; /* diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 
044897be021f..3855096c59b8 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -41,10 +41,11 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, + .walk_stack = print_context_stack, }; struct frame_head { -- cgit v1.2.2 From 06d65bda75341485d32f33da474b0664819ad497 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Dec 2009 05:40:34 +0100 Subject: perf events, x86/stacktrace: Fix performance/softlockup by providing a special frame pointer-only stack walker It's just wasteful for stacktrace users like perf to walk through every entries on the stack whereas these only accept reliable ones, ie: that the frame pointer validates. Since perf requires pure reliable stacktraces, it needs a stack walker based on frame pointers-only to optimize the stacktrace processing. This might solve some near-lockup scenarios that can be triggered by call-graph tracing timer events. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1261024834-5336-2-git-send-regression-fweisbec@gmail.com> [ v2: fix for modular builds and small detail tidyup ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 6 ++++++ arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/dumpstack.c | 28 ++++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 6c75151a3cca..35e89122a42f 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -22,6 +22,12 @@ print_context_stack(struct thread_info *tinfo, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph); +extern unsigned long +print_context_stack_bp(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); + /* Generic stack tracer with callbacks */ struct stacktrace_ops { diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d3802ee5a416..c223b7e895d9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2336,7 +2336,7 @@ static const struct stacktrace_ops backtrace_ops = { .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, - .walk_stack = print_context_stack, + .walk_stack = print_context_stack_bp, }; #include "../dumpstack.h" diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 8aaa119b7cad..c56bc2873030 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -109,6 +109,30 @@ print_context_stack(struct thread_info *tinfo, } return bp; } +EXPORT_SYMBOL_GPL(print_context_stack); + +unsigned long +print_context_stack_bp(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long *ret_addr = &frame->return_address; + + while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { + unsigned long addr = 
*ret_addr; + + if (__kernel_text_address(addr)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + } + return (unsigned long)frame; +} +EXPORT_SYMBOL_GPL(print_context_stack_bp); static void @@ -143,8 +167,8 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) static const struct stacktrace_ops print_trace_ops = { .warning = print_trace_warning, .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .stack = print_trace_stack, + .address = print_trace_address, .walk_stack = print_context_stack, }; -- cgit v1.2.2 From 4beb3d6d144c41525541cce2b611858b2645c725 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 16 Dec 2009 17:39:48 -0800 Subject: x86: Don't use POSIX character classes in gen-insn-attr-x86.awk Not all awk implementations (including the default awk in Ubuntu 9.10) support POSIX character classes. Since x86-opcode-map.txt is plain ASCII, we can just use explicit ranges for lower case, alphabetic, and alphanumeric characters instead. Signed-off-by: Roland Dreier Acked-by: Masami Hiramatsu LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/tools/gen-insn-attr-x86.awk | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 7a6850683c34..eaf11f52fc0b 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -6,8 +6,6 @@ # Awk implementation sanity check function check_awk_implement() { - if (!match("abc", "[[:lower:]]+")) - return "Your awk doesn't support charactor-class." if (sprintf("%x", 0) != "0") return "Your awk has a printf-format problem." return "" @@ -44,12 +42,12 @@ BEGIN { delete gtable delete atable - opnd_expr = "^[[:alpha:]/]" + opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" sep_expr = "^\\|$" - group_expr = "^Grp[[:alnum:]]+" + group_expr = "^Grp[0-9A-Za-z]+" - imm_expr = "^[IJAO][[:lower:]]" + imm_expr = "^[IJAO][a-z]" imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" @@ -62,7 +60,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" - modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])" + modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" rex_expr = "^REX(\\.[XRWB]+)*" fpu_expr = "^ESC" # TODO -- cgit v1.2.2 From 04a1e62c2cec820501f93526ad1e46073b802dc4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2009 07:04:56 -0800 Subject: x86/ptrace: make genregs[32]_get/set more robust The loop condition is fragile: we compare an unsigned value to zero, and then decrement it by something larger than one in the loop. All the callers should be passing in appropriately aligned buffer lengths, but it's better to just not rely on it, and have some appropriate defensive loop limits. 
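[ illustration: a standalone sketch of the fragile vs. defensive loop shape
  described above; hypothetical demo code, not part of the patch. With an
  unsigned count that is not a multiple of the element size, the "count > 0"
  form never reaches zero and wraps around: ]

#include <stddef.h>

/* Fragile: if count == 6 and sizeof(*dst) == 8, the first subtraction
 * wraps count around to a huge value and the loop runs away (shown for
 * contrast only; do not call it with a misaligned length). */
void copy_words_fragile(unsigned long *dst, size_t count)
{
        while (count > 0) {
                *dst++ = 0;
                count -= sizeof(*dst);
        }
}

/* Defensive: stops as soon as fewer than sizeof(*dst) bytes remain,
 * whatever length the caller passed in. */
void copy_words_robust(unsigned long *dst, size_t count)
{
        while (count >= sizeof(*dst)) {
                *dst++ = 0;
                count -= sizeof(*dst);
        }
}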
Acked-by: Roland McGrath Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2779321046bd..017d937639fe 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -509,14 +509,14 @@ static int genregs_get(struct task_struct *target, { if (kbuf) { unsigned long *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { *k++ = getreg(target, pos); count -= sizeof(*k); pos += sizeof(*k); } } else { unsigned long __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { if (__put_user(getreg(target, pos), u++)) return -EFAULT; count -= sizeof(*u); @@ -535,14 +535,14 @@ static int genregs_set(struct task_struct *target, int ret = 0; if (kbuf) { const unsigned long *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) @@ -1458,14 +1458,14 @@ static int genregs32_get(struct task_struct *target, { if (kbuf) { compat_ulong_t *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { getreg32(target, pos, k++); count -= sizeof(*k); pos += sizeof(*k); } } else { compat_ulong_t __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { compat_ulong_t word; getreg32(target, pos, &word); if (__put_user(word, u++)) @@ -1486,14 +1486,14 @@ static int genregs32_set(struct task_struct *target, int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) -- cgit v1.2.2 From b76365a18f7593c9df32a74bf2b4a39741b67bc6 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Thu, 17 Dec 2009 10:53:25 -0600 Subject: x86, uv: Add serial number parameter to uv_bios_get_sn_info() Add system_serial_number to the information returned by uv_bios_get_sn_info() UV BIOS call. Signed-off-by: Russ Anderson LKML-Reference: <20091217165323.GA30774@sgi.com> Cc: Jack Steiner Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 7 ++++--- arch/x86/kernel/apic/x2apic_uv_x.c | 6 +++--- arch/x86/kernel/bios_uv.c | 20 ++++++++++++++------ 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 2751f3075d8b..3fbc1f348a7d 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -18,8 +18,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. 
+ * Copyright (c) Russ Anderson */ #include @@ -89,7 +89,7 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); +extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, unsigned long *); @@ -104,6 +104,7 @@ extern int uv_type; extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; +extern long system_serial_number; #define partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b684bb303cbf..af5d103bb533 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. */ #include #include @@ -627,8 +627,8 @@ void __init uv_system_init(void) } uv_bios_init(); - uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &sn_coherency_id, &sn_region_size); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, + &sn_region_size, &system_serial_number); uv_rtc_init(); for_each_present_cpu(cpu) { diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index b0206a211b09..c918ebab52ab 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -15,8 +15,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. 
+ * Copyright (c) Russ Anderson */ #include @@ -30,6 +30,7 @@ static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { struct uv_systab *tab = &uv_systab; + s64 ret; if (!tab->function) /* @@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - return efi_call6((void *)__va(tab->function), - (u64)which, a1, a2, a3, a4, a5); + ret = efi_call6((void *)__va(tab->function), (u64)which, + a1, a2, a3, a4, a5); + return ret; } +EXPORT_SYMBOL_GPL(uv_bios_call); s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) @@ -73,11 +76,14 @@ long sn_coherency_id; EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); +long system_serial_number; +EXPORT_SYMBOL_GPL(system_serial_number); int uv_type; +EXPORT_SYMBOL_GPL(uv_type); s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region) + long *region, long *ssn) { s64 ret; u64 v0, v1; @@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, *coher = part.coherence_id; if (region) *region = part.region_size; + if (ssn) + *ssn = v1; return ret; } +EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); int uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, @@ -185,4 +194,3 @@ void uv_bios_init(void) void uv_bios_init(void) { } #endif - -- cgit v1.2.2 From 6c56ccecf05fafe100ab4ea94f6fccbf5ff00db7 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 17 Dec 2009 12:27:02 -0800 Subject: x86: Reenable TSC sync check at boot, even with NONSTOP_TSC Commit 83ce4009 did the following change If the TSC is constant and non-stop, also set it reliable. But, there seems to be few systems that will end up with TSC warp across sockets, depending on how the cpus come out of reset. Skipping TSC sync test on such systems may result in time inconsistency later. So, reenable TSC sync test even on constant and non-stop TSC systems. Set, sched_clock_stable to 1 by default and reset it in mark_tsc_unstable, if TSC sync fails. This change still gives perf benefit mentioned in 83ce4009 for systems where TSC is reliable. Signed-off-by: Venkatesh Pallipadi Acked-by: Suresh Siddha LKML-Reference: <20091217202702.GA18015@linux-os.sc.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/intel.c | 1 - arch/x86/kernel/tsc.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9c31e8b09d2c..879666f4d871 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); sched_clock_stable = 1; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cd982f48e23e..597683aa5ba0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; + sched_clock_stable = 0; printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) -- cgit v1.2.2 From 8c63450718ea62ee3a70bffde170b4d15fc72d3c Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Thu, 17 Dec 2009 15:26:36 -0800 Subject: x86: Fix objdump version check in arch/x86/tools/chkobjdump.awk It says Warning: objdump version is older than 2.19 Warning: Skipping posttest. because it used the wrong field from `objdump -v': akpm:/usr/src/25> /opt/crosstool/gcc-4.0.2-glibc-2.3.6/x86_64-unknown-linux-gnu/bin/x86_64-unknown-linux-gnu-objdump -v GNU objdump 2.16.1 Copyright 2005 Free Software Foundation, Inc. This program is free software; you may redistribute it under the terms of the GNU General Public License. This program has absolutely no warranty. Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton LKML-Reference: <200912172326.nBHNQaQl024796@imap1.linux-foundation.org> Signed-off-by: H. Peter Anvin Cc: Masami Hiramatsu --- arch/x86/tools/chkobjdump.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk index 0d13cd9fdcff..5bbb5a33f220 100644 --- a/arch/x86/tools/chkobjdump.awk +++ b/arch/x86/tools/chkobjdump.awk @@ -9,7 +9,7 @@ BEGIN { } /^GNU/ { - split($4, ver, "."); + split($3, ver, "."); if (ver[1] > od_ver || (ver[1] == od_ver && ver[2] >= od_sver)) { exit 1; -- cgit v1.2.2 From 18374d89e5fe96772102f44f535efb1198d9be08 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 17 Dec 2009 18:29:46 -0800 Subject: x86, irq: Allow 0xff for /proc/irq/[n]/smp_affinity on an 8-cpu system John Blackwood reported: > on an older Dell PowerEdge 6650 system with 8 cpus (4 are hyper-threaded), > and 32 bit (x86) kernel, once you change the irq smp_affinity of an irq > to be less than all cpus in the system, you can never change really the > irq smp_affinity back to be all cpus in the system (0xff) again, > even though no error status is returned on the "/bin/echo ff > > /proc/irq/[n]/smp_affinity" operation. > > This is due to that fact that BAD_APICID has the same value as > all cpus (0xff) on 32bit kernels, and thus the value returned from > set_desc_affinity() via the cpu_mask_to_apicid_and() function is treated > as a failure in set_ioapic_affinity_irq_desc(), and no affinity changes > are made. set_desc_affinity() is already checking if the incoming cpu mask intersects with the cpu online mask or not. So there is no need for the apic op cpu_mask_to_apicid_and() to check again and return BAD_APICID. 
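[ illustration: a hedged, self-contained sketch of the error-reporting
  pattern this change adopts -- status in the return value, result through
  an out-parameter -- using hypothetical names, not the kernel functions: ]

#include <stdio.h>

/* Old style for contrast: "return BAD_ID on error" cannot be told apart
 * from a legitimate result of 0xff on an 8-cpu box. */
#define BAD_ID 0xffu

/* New style: every possible id value stays usable, because failure is
 * reported separately through the return value. */
static int lookup_id(unsigned int cpu, unsigned int *id)
{
        if (cpu > 7)
                return -1;                      /* unambiguous failure */
        *id = (1u << (cpu + 1)) - 1;            /* mask of cpus 0..cpu */
        return 0;
}

int main(void)
{
        unsigned int id;

        if (!lookup_id(7, &id))
                printf("id = %#x\n", id);       /* 0xff is a valid result
                                                 * here, not a sentinel */
        return 0;
}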
Remove the BAD_APICID return value from cpu_mask_to_apicid_and() and also fix set_desc_affinity() to return -1 instead of using BAD_APICID to represent error conditions (as cpu_mask_to_apicid_and() can return logical or physical apicid values and BAD_APICID is really to represent bad physical apic id). Reported-by: John Blackwood Root-caused-by: John Blackwood Signed-off-by: Suresh Siddha LKML-Reference: <1261103386.2535.409.camel@sbs-t61> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/hw_irq.h | 3 ++- arch/x86/kernel/apic/apic_flat_64.c | 5 +---- arch/x86/kernel/apic/bigsmp_32.c | 5 +---- arch/x86/kernel/apic/io_apic.c | 32 ++++++++++++++------------------ arch/x86/kernel/apic/x2apic_cluster.c | 5 +---- arch/x86/kernel/apic/x2apic_phys.c | 5 +---- arch/x86/kernel/apic/x2apic_uv_x.c | 5 +---- arch/x86/kernel/uv_irq.c | 3 +-- 8 files changed, 22 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 08c48a81841f..eeac829a0f44 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -103,7 +103,8 @@ extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); struct irq_desc; -extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *); +extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *, + unsigned int *dest_id); extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index d0c99abc26c3..eacbd2b31d27 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -306,10 +306,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } struct apic apic_physflat = { diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 38dcecfa5818..cb804c5091b9 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -131,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return bigsmp_cpu_to_logical_apicid(cpu); - - return BAD_APICID; + return bigsmp_cpu_to_logical_apicid(cpu); } static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d5d498fbee4b..98ced709e829 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2276,26 +2276,28 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq /* * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and * leaves desc->affinity untouched. 
*/ unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, + unsigned int *dest_id) { struct irq_cfg *cfg; unsigned int irq; if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; + return -1; irq = desc->irq; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; + return -1; cpumask_copy(desc->affinity, mask); - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + return 0; } static int @@ -2311,12 +2313,11 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) cfg = desc->chip_data; spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { + ret = set_desc_affinity(desc, mask, &dest); + if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); - ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -3351,8 +3352,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3384,8 +3384,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) if (get_irte(irq, &irte)) return -1; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3567,8 +3566,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3623,8 +3621,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3730,8 +3727,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irq_cfg *cfg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index a5371ec36776..cf69c59f4910 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_logical_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_logical_apicid, cpu); } static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a8989aadc99a..8972f38c5ced 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } static unsigned int x2apic_phys_get_apic_id(unsigned long x) diff --git 
a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b684bb303cbf..d56b0efb2057 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -225,10 +225,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } static unsigned int x2apic_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 61d805df4c91..ece73d8e3240 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -215,8 +215,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) unsigned long mmr_offset; unsigned mmr_pnode; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; mmr_value = 0; -- cgit v1.2.2 From 99e8c5a3b875a34d894a711c9a3669858d6adf45 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Dec 2009 01:33:54 +0100 Subject: hw-breakpoints: Fix hardware breakpoints -> perf events dependency The kbuild's select command doesn't propagate through the config dependencies. Hence the current rules of hardware breakpoint's config can't ensure perf can never be disabled under us. We have: config X86 selects HAVE_HW_BREAKPOINTS config HAVE_HW_BREAKPOINTS select PERF_EVENTS config PERF_EVENTS [...] x86 will select the breakpoints but that won't propagate to perf events. The user can still disable the latter, but it is necessary for the breakpoints. What we need is: - x86 selects HAVE_HW_BREAKPOINTS and PERF_EVENTS - HAVE_HW_BREAKPOINTS depends on PERF_EVENTS so that we ensure PERF_EVENTS is enabled and frozen for x86. This fixes the following kind of build errors: In file included from arch/x86/kernel/hw_breakpoint.c:31: include/linux/hw_breakpoint.h: In function 'hw_breakpoint_addr': include/linux/hw_breakpoint.h:39: error: 'struct perf_event' has no member named 'attr' v2: Select also ANON_INODES from x86, required for perf Reported-by: Cyrill Gorcunov Reported-by: Michal Marek Reported-by: Andrew Randrianasulu Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Randy Dunlap Cc: K.Prasad LKML-Reference: <1261010034-7786-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3b2a5aca4edb..55298e891571 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -50,6 +50,8 @@ config X86 select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA select HAVE_HW_BREAKPOINT + select PERF_EVENTS + select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER -- cgit v1.2.2 From 8bee738bb1979c8bf7b42716b772522ab7d26b0c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 18 Dec 2009 10:40:13 -0500 Subject: x86: Fix objdump version check in chkobjdump.awk for different formats. Different version of objdump says its version in different way; GNU objdump 2.16.1 or GNU objdump version 2.19.51.0.14-1.fc11 20090722 This patch uses the first argument which starts with a number as version string. Changes in v2: - Remove unneeded increment. Signed-off-by: Masami Hiramatsu LKML-Reference: <20091218154012.16960.5113.stgit@dhcp-100-2-132.bos.redhat.com> Suggested-by: H. 
Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/tools/chkobjdump.awk | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk index 5bbb5a33f220..fd1ab80be0de 100644 --- a/arch/x86/tools/chkobjdump.awk +++ b/arch/x86/tools/chkobjdump.awk @@ -8,14 +8,24 @@ BEGIN { od_sver = 19; } -/^GNU/ { - split($3, ver, "."); +/^GNU objdump/ { + verstr = "" + for (i = 3; i <= NF; i++) + if (match($(i), "^[0-9]")) { + verstr = $(i); + break; + } + if (verstr == "") { + printf("Warning: Failed to find objdump version number.\n"); + exit 0; + } + split(verstr, ver, "."); if (ver[1] > od_ver || (ver[1] == od_ver && ver[2] >= od_sver)) { exit 1; } else { printf("Warning: objdump version %s is older than %d.%d\n", - $4, od_ver, od_sver); + verstr, od_ver, od_sver); print("Warning: Skipping posttest."); # Logic is inverted, because we just skip test without error. exit 0; -- cgit v1.2.2 From 0f764806438d5576ac58898332e5dcf30bb8a679 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Dec 2009 15:51:23 +0100 Subject: x86/amd-iommu: Fix initialization failure panic The assumption that acpi_table_parse passes the return value of the hanlder function to the caller proved wrong recently. The return value of the handler function is totally ignored. This makes the initialization code for AMD IOMMU buggy in a way that could cause a kernel panic on initialization. This patch fixes the issue in the AMD IOMMU driver. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 1dca9c34eaeb..fb490ce7dd55 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -137,6 +137,11 @@ int amd_iommus_present; /* IOMMUs have a non-present cache? */ bool amd_iommu_np_cache __read_mostly; +/* + * Set to true if ACPI table parsing and hardware intialization went properly + */ +static bool amd_iommu_initialized; + /* * List of protection domains - used during resume */ @@ -929,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table) } WARN_ON(p != end); + amd_iommu_initialized = true; + return 0; } @@ -1263,6 +1270,9 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; + if (!amd_iommu_initialized) + goto free; + if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; -- cgit v1.2.2 From 1d9cb470a755409ce97c3376174b1e234bd20371 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:14 -0700 Subject: ACPI: processor: introduce arch_has_acpi_pdc arch dependent helper function that tells us if we should attempt to evaluate _PDC on this machine or not. The x86 implementation assumes that the CPUs in the machine must be homogeneous, and that you cannot mix CPUs of different vendors. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/include/asm/acpi.h | 7 +++++++ arch/x86/kernel/acpi/processor.c | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 60d2b2db0bc5..d787e6e92bd1 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -142,6 +142,13 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) return max_cstate; } +static inline bool arch_has_acpi_pdc(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + return (c->x86_vendor == X86_VENDOR_INTEL || + c->x86_vendor == X86_VENDOR_CENTAUR); +} + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d85d1b2432ba..bcb6efe08c5d 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -79,9 +79,7 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) struct cpuinfo_x86 *c = &cpu_data(pr->id); pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL || - c->x86_vendor == X86_VENDOR_CENTAUR) - init_intel_pdc(pr, c); + init_intel_pdc(pr, c); return; } -- cgit v1.2.2 From 407cd87c54e76c266245e8faef8dd4a84b7254fe Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:19 -0700 Subject: ACPI: processor: unify arch_acpi_processor_init_pdc The x86 and ia64 implementations of arch_acpi_processor_init_pdc() are almost exactly the same. The only difference is in what bits they set in obj_list buffer. Combine the boilerplate memory management code, and leave the arch-specific bit twiddling in separate implementations. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/processor.c | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index bcb6efe08c5d..967860b43f2a 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -14,31 +14,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) { - struct acpi_object_list *obj_list; - union acpi_object *obj; - u32 *buf; - - /* allocate and initialize pdc. It will be used later. 
*/ - obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL); - if (!obj_list) { - printk(KERN_ERR "Memory allocation error\n"); - return; - } - - obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL); - if (!obj) { - printk(KERN_ERR "Memory allocation error\n"); - kfree(obj_list); - return; - } - - buf = kmalloc(12, GFP_KERNEL); - if (!buf) { - printk(KERN_ERR "Memory allocation error\n"); - kfree(obj); - kfree(obj_list); - return; - } + u32 *buf = (u32 *)pr->pdc->pointer->buffer.pointer; buf[0] = ACPI_PDC_REVISION_ID; buf[1] = 1; @@ -62,13 +38,6 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MWAIT)) buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); - obj->type = ACPI_TYPE_BUFFER; - obj->buffer.length = 12; - obj->buffer.pointer = (u8 *) buf; - obj_list->count = 1; - obj_list->pointer = obj; - pr->pdc = obj_list; - return; } @@ -78,7 +47,6 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) { struct cpuinfo_x86 *c = &cpu_data(pr->id); - pr->pdc = NULL; init_intel_pdc(pr, c); return; -- cgit v1.2.2 From 08ea48a326d8030ef5b7fb02292faf5a53c95e0a Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:24 -0700 Subject: ACPI: processor: factor out common _PDC settings Both x86 and ia64 initialize _PDC with mostly common bit settings. Factor out the common settings and leave the arch-specific ones alone. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/processor.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index 967860b43f2a..d722ca8cb4c7 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -16,16 +16,8 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) { u32 *buf = (u32 *)pr->pdc->pointer->buffer.pointer; - buf[0] = ACPI_PDC_REVISION_ID; - buf[1] = 1; - buf[2] = ACPI_PDC_C_CAPABILITY_SMP; + buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; - /* - * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so - * that OSPM is capable of native ACPI throttling software - * coordination using BIOS supplied _TSD info. - */ - buf[2] |= ACPI_PDC_SMP_T_SWCOORD; if (cpu_has(c, X86_FEATURE_EST)) buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; -- cgit v1.2.2 From 6c5807d7bc7d051fce00863ffb98d36325501eb2 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:29 -0700 Subject: ACPI: processor: finish unifying arch_acpi_processor_init_pdc() The only thing arch-specific about calling _PDC is what bits get set in the input obj_list buffer. There's no need for several levels of indirection to twiddle those bits. Additionally, since we're just messing around with a buffer, we can simplify the interface; no need to pass around the entire struct acpi_processor * just to get at the buffer. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/include/asm/acpi.h | 19 +++++++++++++++++++ arch/x86/kernel/acpi/processor.c | 34 ---------------------------------- 2 files changed, 19 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index d787e6e92bd1..56f462cf22d2 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -149,6 +149,25 @@ static inline bool arch_has_acpi_pdc(void) c->x86_vendor == X86_VENDOR_CENTAUR); } +static inline void arch_acpi_set_pdc_bits(u32 *buf) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; + + if (cpu_has(c, X86_FEATURE_EST)) + buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; + + if (cpu_has(c, X86_FEATURE_ACPI)) + buf[2] |= ACPI_PDC_T_FFH; + + /* + * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) + buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); +} + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d722ca8cb4c7..0f57307f8224 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -12,40 +12,6 @@ #include #include -static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) -{ - u32 *buf = (u32 *)pr->pdc->pointer->buffer.pointer; - - buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; - - if (cpu_has(c, X86_FEATURE_EST)) - buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; - - if (cpu_has(c, X86_FEATURE_ACPI)) - buf[2] |= ACPI_PDC_T_FFH; - - /* - * If mwait/monitor is unsupported, C2/C3_FFH will be disabled - */ - if (!cpu_has(c, X86_FEATURE_MWAIT)) - buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); - - return; -} - - -/* Initialize _PDC data based on the CPU vendor */ -void arch_acpi_processor_init_pdc(struct acpi_processor *pr) -{ - struct cpuinfo_x86 *c = &cpu_data(pr->id); - - init_intel_pdc(pr, c); - - return; -} - -EXPORT_SYMBOL(arch_acpi_processor_init_pdc); - void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) { if (pr->pdc) { -- cgit v1.2.2 From 47817254b8637b56730aec26eed2c337d3938bb5 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:34 -0700 Subject: ACPI: processor: unify arch_acpi_processor_cleanup_pdc The x86 and ia64 implementations of the function in $subject are exactly the same. Also, since the arch-specific implementations of setting _PDC have been completely hollowed out, remove the empty shells. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/Makefile | 2 +- arch/x86/kernel/acpi/processor.c | 25 ------------------------- 2 files changed, 1 insertion(+), 26 deletions(-) delete mode 100644 arch/x86/kernel/acpi/processor.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index fd5ca97a2ad5..6f35260bb3ef 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) -obj-y += cstate.o processor.o +obj-y += cstate.o endif $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c deleted file mode 100644 index 0f57307f8224..000000000000 --- a/arch/x86/kernel/acpi/processor.c +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2005 Intel Corporation - * Venkatesh Pallipadi - * - Added _PDC for platforms with Intel CPUs - */ - -#include -#include -#include -#include - -#include -#include - -void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) -{ - if (pr->pdc) { - kfree(pr->pdc->pointer->buffer.pointer); - kfree(pr->pdc->pointer); - kfree(pr->pdc); - pr->pdc = NULL; - } -} - -EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc); -- cgit v1.2.2 From 4a28395d72a956f2dad24e343d06bc08c9afb89a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 21 Dec 2009 16:19:58 -0800 Subject: arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c: avoid cross-CPU interrupts by using smp_call_function_any() Presently acpi-cpufreq will perform the MSR read on the first CPU in the mask. That's inefficient if that CPU differs from the current CPU. Because we have to perform a cross-CPU call, but we could have run the rdmsr on the current CPU. So switch to using the new smp_call_function_any(), which will perform the call on the current CPU if that CPU is present in the mask (it is). Cc: "Zhang, Yanmin" Cc: Dave Jones Cc: Ingo Molnar Cc: Jaswinder Singh Rajput Cc: Len Brown Cc: Mike Galbraith Cc: Rusty Russell Cc: Thomas Gleixner Cc: Venkatesh Pallipadi Cc: Zhao Yakui Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index f28decf8dde3..1b1920fa7c80 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -190,9 +190,11 @@ static void do_drv_write(void *_cmd) static void drv_read(struct drv_cmd *cmd) { + int err; cmd->val = 0; - smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); + err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); + WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */ } static void drv_write(struct drv_cmd *cmd) -- cgit v1.2.2 From 2f99f5c8f05e02f3df1bb4d93b6704e6f5972872 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 23 Dec 2009 15:04:53 -0800 Subject: Revert "x86, ucode-amd: Ensure ucode update on suspend/resume after CPU off/online cycle" This reverts commit 9f15226e75583547aaf542c6be4bdac1060dd425. It's just wrong, and broke resume for Rafael even on a non-AMD CPU. As Rafael says: "... it causes microcode_init_cpu() to be called during resume even for CPUs for which there's no microcode to apply. 
That, in turn, results in executing request_firmware() (on Intel CPUs at least) which doesn't work at this stage of resume (we have device interrupts disabled, I/O devices are still suspended and so on). If I'm not mistaken, the "if (uci->valid)" logic means "if that CPU is known to us" , so before commit 9f15226e755 microcode_resume_cpu() was called for all CPUs already in the system during suspend, which was the right thing to do. The commit changed it so that the CPUs without microcode to apply are now treated as "unknown", which is not quite right. The problem this commit attempted to solve has to be handled differently." Bisected-and -requested-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- arch/x86/kernel/microcode_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 844c02c65fcb..0c8632433090 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -394,7 +394,7 @@ static enum ucode_state microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; enum ucode_state ustate; - if (uci->valid && uci->mc) + if (uci->valid) ustate = microcode_resume_cpu(cpu); else ustate = microcode_init_cpu(cpu); -- cgit v1.2.2 From 17a2a9b57a9a7d2fd8f97df951b5e63e0bd56ef5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 25 Dec 2009 15:40:38 -0800 Subject: x86, compress: Force i386 instructions for the decompressor Recently, some distros have started shipping versions of gcc which default to -march=i686. This breaks building kernels for pre-i686 machines, even if they have been selected in Kconfig, due to the generation of CMOV instructions. There isn't enough benefit to try to preserve the generation of these instructions even when selected, so simply force -march=i386 for the decompressor when building a 32-bit kernel. Reported-and-tested-by: Chris Rankin Signed-off-by: H. Peter Anvin LKML-Reference: <219280.97558.qm@web52907.mail.re2.yahoo.com> --- arch/x86/boot/compressed/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f8ed0658404c..f25bbd37765a 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -9,6 +9,7 @@ targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinu KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += $(call cc-option,-ffreestanding) -- cgit v1.2.2 From fb341f572d26e0786167cd96b90cc4febed830cf Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 5 Dec 2009 12:34:11 -0200 Subject: KVM: MMU: remove prefault from invlpg handler The invlpg prefault optimization breaks Windows 2008 R2 occasionally. The visible effect is that the invlpg handler instantiates a pte which is, microseconds later, written with a different gfn by another vcpu. The OS could have other mechanisms to prevent a present translation from being used, which the hypervisor is unaware of. While the documentation states that the cpu is at liberty to prefetch tlb entries, it looks like this is not heeded, so remove tlb prefetch from invlpg. 
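[ illustration: a deterministic, single-threaded walkthrough of the losing
  interleaving, with toy variables standing in for the guest and shadow
  ptes -- not KVM code: ]

#include <stdio.h>

int main(void)
{
        unsigned long gpte = 0x1000;    /* guest pte: current translation */
        unsigned long spte = 0;         /* shadow pte kept by the host    */
        unsigned long cached;

        cached = gpte;  /* vcpu A, in the invlpg handler, reads the gpte  */
        spte = 0;       /* vcpu A drops the shadow entry                  */

        gpte = 0x2000;  /* microseconds later, vcpu B rewrites the gpte   */

        spte = cached;  /* vcpu A prefaults -- and resurrects the stale
                         * 0x1000 translation that B just replaced        */
        printf("shadow pte %#lx, guest pte %#lx\n", spte, gpte);
        return 0;
}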
Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a6017132fba8..58a0f1e88596 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -455,8 +455,6 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; - pt_element_t gpte; - gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -470,10 +468,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (level == PT_PAGE_TABLE_LEVEL || ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - pte_gpa = (sp->gfn << PAGE_SHIFT); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -492,18 +486,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) - return; - if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, - sizeof(pt_element_t), 0); - } } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) -- cgit v1.2.2 From 6e24a6eff4571002cd48b99a2b92dc829ce39cb9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 14 Dec 2009 17:37:35 -0200 Subject: KVM: LAPIC: make sure IRR bitmap is scanned after vm load The vcpus are initialized with irr_pending set to false, but loading the LAPIC registers with pending IRR fails to reset the irr_pending variable. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cd60c0bd1b32..3063a0c4858b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1150,6 +1150,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); + apic->irr_pending = true; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From dab4b911a5327859bb8f969249c6978c26cd4853 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 6 Dec 2009 18:24:15 +0100 Subject: KVM: x86: Extend KVM_SET_VCPU_EVENTS with selective updates User space may not want to overwrite asynchronously changing VCPU event states on write-back. So allow to skip nmi.pending and sipi_vector by setting corresponding bits in the flags field of kvm_vcpu_events. 
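[ illustration: a hedged userspace sketch of the new selective write-back;
  assumes headers from a kernel carrying this patch, error handling
  trimmed: ]

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Update only the SIPI vector; the kernel leaves nmi.pending alone
 * because KVM_VCPUEVENT_VALID_NMI_PENDING is not set in flags. */
static int set_sipi_vector(int vcpu_fd, unsigned int vector)
{
        struct kvm_vcpu_events ev;

        memset(&ev, 0, sizeof(ev));
        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev) < 0)
                return -1;
        ev.sipi_vector = vector;
        ev.flags = KVM_VCPUEVENT_VALID_SIPI_VECTOR;
        return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &ev);
}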
[avi: advertise the bits in KVM_GET_VCPU_EVENTS] Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 4 ++++ arch/x86/kvm/x86.c | 12 ++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 950df434763f..f46b79f6c16c 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -254,6 +254,10 @@ struct kvm_reinject_control { __u8 reserved[31]; }; +/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ +#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 +#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 + /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { struct { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d068966fb2a..6651dbf58675 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1913,7 +1913,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = vcpu->arch.sipi_vector; - events->flags = 0; + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR); vcpu_put(vcpu); } @@ -1921,7 +1922,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - if (events->flags) + if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) return -EINVAL; vcpu_load(vcpu); @@ -1938,10 +1940,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_pic_clear_isr_ack(vcpu->kvm); vcpu->arch.nmi_injected = events->nmi.injected; - vcpu->arch.nmi_pending = events->nmi.pending; + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + vcpu->arch.nmi_pending = events->nmi.pending; kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); - vcpu->arch.sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) + vcpu->arch.sipi_vector = events->sipi_vector; vcpu_put(vcpu); -- cgit v1.2.2 From fd2a50a0240f5f5b59070474eabd83a85720a406 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Thu, 24 Dec 2009 01:54:47 +0000 Subject: x86, perfctr: Remove unused func avail_to_resrv_perfctr_nmi() avail_to_resrv_perfctr_nmi() is neither EXPORT'd, nor used in the file. So remove it. 
Signed-off-by: Naga Chumbalkar Acked-by: Cyrill Gorcunov Cc: oprofile-list@lists.sf.net LKML-Reference: <20091224015441.6005.4408.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 1 - arch/x86/kernel/cpu/perfctr-watchdog.c | 11 ----------- 2 files changed, 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 139d4c1a33a7..93da9c3f3341 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -19,7 +19,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); -extern int avail_to_resrv_perfctr_nmi(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 898df9719afb..74f4e85a5727 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) return !test_bit(counter, perfctr_nmi_owner); } - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); int reserve_perfctr_nmi(unsigned int msr) -- cgit v1.2.2 From d015a092989d673df44a5ad6866dc5d5006b7a2a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 28 Dec 2009 10:26:59 +0200 Subject: x86: Use KERN_DEFAULT log-level in __show_regs() Andrew Morton reported a strange looking kmemcheck warning: WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88004fba6c20) 0000000000000000310000000000000000000000000000002413000000c9ffff u u u u u u u u u u u u u u u u i i i i i i i i u u u u u u u u [] kmemleak_scan+0x25a/0x540 [] kmemleak_scan_thread+0x5b/0xe0 [] kthread+0x9e/0xb0 [] kernel_thread_helper+0x4/0x10 [] 0xffffffffffffffff The above printout is missing register dump completely. The problem here is that the output comes from syslog which doesn't show KERN_INFO log-level messages. We didn't see this before because both of us were testing on 32-bit kernels which use the _default_ log-level. Fix that up by explicitly using KERN_DEFAULT log-level for __show_regs() printks. 
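[ illustration: a minimal kernel-code sketch of the log-level distinction
  at issue; hypothetical helper, assumes a printk of this era: ]

#include <linux/kernel.h>

static void demo_loglevels(void)
{
        /* KERN_INFO lines are filtered out by syslog configurations
         * like the reporter's, which is how the register dump went
         * missing: */
        printk(KERN_INFO "hidden on quiet consoles\n");

        /* KERN_DEFAULT expands to the default message loglevel, the
         * same one an unprefixed printk() gets, so it shows up: */
        printk(KERN_DEFAULT "RIP: ...\n");

        /* KERN_CONT continues the previous line instead of starting
         * a new record at a new level: */
        printk(KERN_CONT " (continued)\n");
}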
Signed-off-by: Pekka Enberg Cc: Vegard Nossum Cc: Andrew Morton Cc: Arjan van de Ven Cc: Linus Torvalds LKML-Reference: <1261988819.4641.2.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/process_32.c | 14 +++++++------- arch/x86/kernel/process_64.c | 24 ++++++++++++------------ 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 98c2cdeb599e..c6ee241c8a98 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -103,8 +103,8 @@ void show_regs_common(void) if (!product) product = ""; - printk("\n"); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9c517b5858f0..37ad1e046aae 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -139,16 +139,16 @@ void __show_regs(struct pt_regs *regs, int all) show_regs_common(); - printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); print_symbol("EIP is at %s\n", regs->ip); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); if (!all) @@ -158,19 +158,19 @@ void __show_regs(struct pt_regs *regs, int all) cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); - printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", d6, d7); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 52fbd0c60198..f9e033150cdf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,19 +161,19 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int ds, cs, es; show_regs_common(); - printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); - printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, 
regs->di); - printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); - printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); @@ -194,21 +194,21 @@ void __show_regs(struct pt_regs *regs, int all) cr3 = read_cr3(); cr4 = read_cr4(); - printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) -- cgit v1.2.2 From c0ca9da442df82b67095f230f24762042f9f3b7d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 28 Dec 2009 11:02:15 +0200 Subject: x86, kmemcheck: Use KERN_WARNING for error reporting As suggested by Vegard Nossum, use KERN_WARNING for error reporting to make sure kmemcheck reports end up in syslog. Suggested-by: Vegard Nossum Signed-off-by: Pekka Enberg Cc: Andrew Morton LKML-Reference: <1261990935.4641.7.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmemcheck/error.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 4901d0dafda6..af3b6c8a436f 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -106,26 +106,25 @@ void kmemcheck_error_recall(void) switch (e->type) { case KMEMCHECK_ERROR_INVALID_ACCESS: - printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " - "from %s memory (%p)\n", + printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", 8 * e->size, e->state < ARRAY_SIZE(desc) ? 
desc[e->state] : "(invalid shadow state)", (void *) e->address); - printk(KERN_INFO); + printk(KERN_WARNING); for (i = 0; i < SHADOW_COPY_SIZE; ++i) - printk("%02x", e->memory_copy[i]); - printk("\n"); + printk(KERN_CONT "%02x", e->memory_copy[i]); + printk(KERN_CONT "\n"); - printk(KERN_INFO); + printk(KERN_WARNING); for (i = 0; i < SHADOW_COPY_SIZE; ++i) { if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) - printk(" %c", short_desc[e->shadow_copy[i]]); + printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); else - printk(" ?"); + printk(KERN_CONT " ?"); } - printk("\n"); - printk(KERN_INFO "%*c\n", 2 + 2 + printk(KERN_CONT "\n"); + printk(KERN_WARNING "%*c\n", 2 + 2 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); break; case KMEMCHECK_ERROR_BUG: -- cgit v1.2.2 From 39d30770992895d55789de64bad2349510af68d0 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 28 Dec 2009 13:28:25 -0800 Subject: x86: SGI UV: Fix writes to led registers on remote uv hubs The wrong address was being used to write the SCIR led regs on remote hubs. Also, there was an inconsistency between how BIOS and the kernel indexed these regs. Standardize on using the lower 6 bits of the APIC ID as the index. This patch fixes the problem of writing to an errant address to a cpu # >= 64. Signed-off-by: Mike Travis Reviewed-by: Jack Steiner Cc: Robin Holt Cc: Linus Torvalds Cc: stable@kernel.org LKML-Reference: <4B3922F9.3060905@sgi.com> [ v2: fix a number of annoying checkpatch artifacts and whitespace noise ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 86 +++++++++++++++++++++----------------- arch/x86/kernel/apic/x2apic_uv_x.c | 12 +++--- 2 files changed, 54 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 811bfabc80b7..bc54fa965af3 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -31,20 +31,20 @@ * contiguous (although various IO spaces may punch holes in * it).. * - * N - Number of bits in the node portion of a socket physical - * address. + * N - Number of bits in the node portion of a socket physical + * address. * - * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of - * routers always have low bit of 1, C/MBricks have low bit - * equal to 0. Most addressing macros that target UV hub chips - * right shift the NASID by 1 to exclude the always-zero bit. - * NASIDs contain up to 15 bits. + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. * * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead * of nasids. * - * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant - * of the nasid for socket usage. + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. * * * NumaLink Global Physical Address Format: @@ -71,12 +71,12 @@ * * * APICID format - * NOTE!!!!!! This is the current format of the APICID. However, code - * should assume that this will change in the future. Use functions - * in this file for all APICID bit manipulations and conversion. + * NOTE!!!!!! This is the current format of the APICID. However, code + * should assume that this will change in the future. 
Use functions + * in this file for all APICID bit manipulations and conversion. * - * 1111110000000000 - * 5432109876543210 + * 1111110000000000 + * 5432109876543210 * pppppppppplc0cch * sssssssssss * @@ -89,9 +89,9 @@ * Note: Processor only supports 12 bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. * - * Unless otherwise specified, all references to APICID refer to - * the FULL value contained in ACPI tables, not the subset in the - * processor APICID register. + * Unless otherwise specified, all references to APICID refer to + * the FULL value contained in ACPI tables, not the subset in the + * processor APICID register. */ @@ -151,16 +151,16 @@ struct uv_hub_info_s { }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) +#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) /* * Local & Global MMR space macros. - * Note: macros are intended to be used ONLY by inline functions - * in this file - not by other kernel code. - * n - NASID (full 15-bit global nasid) - * g - GNODE (full 15-bit global nasid, right shifted 1) - * p - PNODE (local part of nsids, right shifted 1) + * Note: macros are intended to be used ONLY by inline functions + * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) @@ -215,8 +215,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. - * Note: use the standard __pa() & __va() macros for converting - * between socket virtual and socket physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. */ /* socket phys RAM --> UV global physical address */ @@ -287,21 +287,18 @@ static inline int uv_apicid_to_pnode(int apicid) * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ -static inline unsigned long *uv_global_mmr32_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR32_BASE | UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr32(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr32_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr32(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { return readq(uv_global_mmr32_address(pnode, offset)); } @@ -310,21 +307,18 @@ static inline unsigned long uv_read_global_mmr32(int pnode, * Access Global MMR space using the MMR space located at the top of physical * memory. 
*/ -static inline unsigned long *uv_global_mmr64_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR64_BASE | UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr64(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr64_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr64(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { return readq(uv_global_mmr64_address(pnode, offset)); } @@ -338,6 +332,16 @@ static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long o return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val); } +static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) +{ + writeb(val, uv_global_mmr64_address(pnode, offset)); +} + +static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset) +{ + return readb(uv_global_mmr64_address(pnode, offset)); +} + /* * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. @@ -457,11 +461,17 @@ static inline void uv_set_scir_bits(unsigned char value) } } +static inline unsigned long uv_scir_offset(int apicid) +{ + return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f); +} + static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_write_global_mmr8(uv_cpu_to_pnode(cpu), + uv_cpu_hub_info(cpu)->scir.offset, value); uv_cpu_hub_info(cpu)->scir.state = value; - uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); } } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d56b0efb2057..5f92494dab61 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -629,8 +629,10 @@ void __init uv_system_init(void) uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -651,15 +653,13 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", + cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ -- cgit v1.2.2 From 9959c888a38b0f25b0e81a480f537d6489348442 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 28 Dec 2009 21:08:29 -0800 Subject: x86: Increase NR_IRQS and nr_irqs I have a system with lots of igb and ixgbe, when iov/vf are enabled for them, we hit the limit of 3064. 
When a system has 20 PCIe cards installed, each card has 2 functions, and each function needs 64 MSI-X vectors, we may need 20 * 2 * 64 = 2560 vectors for MSI-X; if IOV and VF are enabled on top of that, we may need 20 * 2 * 64 * 3 = 7680 vectors for MSI-X. Assuming a system with 5 IO-APICs, nr_irqs_gsi will be 120. With NR_CPUS = 512 and nr_cpu_ids = 128, we will have NR_IRQS = 256 + 512 * 64 = 33024 and nr_irqs = 120 + 8 * 128 + 120 * 64 = 8824. When SPARSE_IRQ is not set, there is no increase in kernel data size. When NR_CPUS=128 and SPARSE_IRQ is set: text data bss dec hex filename 21837444 4216564 12480736 38534744 24bfe58 vmlinux.before 21837442 4216580 12480736 38534758 24bfe66 vmlinux.after When NR_CPUS=4096 and SPARSE_IRQ is set: text data bss dec hex filename 21878619 5610244 13415392 40904255 270263f vmlinux.before 21878617 5610244 13415392 40904253 270263d vmlinux.after Signed-off-by: Yinghai Lu Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4B398ECD.1080506@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 12 ++++++------ arch/x86/kernel/apic/io_apic.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4611f085cd43..3ab43df089cd 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -154,21 +154,21 @@ static inline int invalid_vm86_irq(int irq) #define NR_IRQS_LEGACY 16 -#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS ) #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) #ifdef CONFIG_X86_IO_APIC # ifdef CONFIG_SPARSE_IRQ +# define CPU_VECTOR_LIMIT (64 * NR_CPUS) # define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) # else -# if NR_CPUS < MAX_IO_APICS -# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) -# else -# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) -# endif +# define CPU_VECTOR_LIMIT (32 * NR_CPUS) +# define NR_IRQS \ + (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) # endif #else /* !CONFIG_X86_IO_APIC: */ # define NR_IRQS NR_IRQS_LEGACY
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index de00c4619a55..d9cd1f1b9c07 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3840,7 +3840,7 @@ int __init arch_probe_nr_irqs(void) /* * for MSI and HT dyn irq */ - nr += nr_irqs_gsi * 16; + nr += nr_irqs_gsi * 64; #endif if (nr < nr_irqs) nr_irqs = nr;
-- cgit v1.2.2
From 499a5f1efa0b0ac56ec5d060412aed84ae68e63e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Dec 2009 16:05:51 +0000 Subject: x86: Lift restriction on the location of FIX_BTMAP_* The early ioremap fixmap entries cover half (or for 32-bit non-PAE, a quarter) of a page table, yet so far they were unconditionally aligned to a 256-entry boundary. This is not necessary if the range of page table entries falls entirely within a single page table anyway. This buys back, for (theoretically) 50% of all configurations (25% of all non-PAE ones), at least some of the lowmem necessarily lost with commit e621bd18958ef5dbace3129ebe17a0a475e127d9.
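[ Editor's illustration, not part of the patch: a standalone userspace sketch of the test the new FIX_BTMAP_END expression encodes. Two fixmap indices land in the same page table iff they agree in all bits above the low log2(PTRS_PER_PTE) bits, i.e. (a ^ b) & -PTRS_PER_PTE == 0. PTRS_PER_PTE and the sample index values below are assumptions for demonstration only. ]

#include <stdio.h>

#define PTRS_PER_PTE 512UL	/* assumption: PAE/64-bit style pte table */

/* nonzero bits above the pte-index bits mean different page tables */
static int same_pte_table(unsigned long a, unsigned long b)
{
	return ((a ^ b) & ~(PTRS_PER_PTE - 1)) == 0;	/* same as & -PTRS_PER_PTE */
}

int main(void)
{
	unsigned long end = 300;	/* hypothetical __end_of_permanent_fixed_addresses */
	unsigned long total = 256;	/* NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS */

	if (same_pte_table(end, end + total - 1))
		printf("no alignment needed: one pte table covers the range\n");
	else
		printf("round up to the next %lu-entry boundary\n", PTRS_PER_PTE);
	return 0;
}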
Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4B2BB66F0200007800026AD6@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 16 +++++++++++----- arch/x86/mm/ioremap.c | 4 ++++ 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 14f9890eb495..635f03bb4995 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -118,14 +118,20 @@ enum fixed_addresses { * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. * - * We round it up to the next 256 pages boundary so that we - * can have a single pgd entry and a single pte table: + * If necessary we round it up to the next 256 pages boundary so + * that we can have a single pgd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 #define FIX_BTMAPS_SLOTS 4 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - - (__end_of_permanent_fixed_addresses & 255), - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) + FIX_BTMAP_END = + (__end_of_permanent_fixed_addresses ^ + (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) & + -PTRS_PER_PTE + ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - + (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) + : __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c246d259822d..03c75ffd5c2a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -422,6 +422,10 @@ void __init early_ioremap_init(void) * The boot-ioremap range spans multiple pmds, for which * we are not prepared: */ +#define __FIXADDR_TOP (-PAGE_SIZE) + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); +#undef __FIXADDR_TOP if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { WARN_ON(1); printk(KERN_WARNING "pmd %p != %p\n", -- cgit v1.2.2 From 1b1d9258181bae199dc940f4bd0298126b9a73d9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Dec 2009 16:12:56 +0000 Subject: x86-64: Modify copy_user_generic() alternatives mechanism In order to avoid unnecessary chains of branches, rather than implementing copy_user_generic() as a function consisting of just a single (possibly patched) branch, instead properly deal with patching call instructions in the alternative instructions framework, and move the patching into the callers. As a follow-on, one could also introduce something like __EXPORT_SYMBOL_ALT() to avoid patching call sites in modules. 
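[ Editor's illustration, not part of the patch: why apply_alternatives() adds "a->replacement - a->instr" to the rel32 of a copied CALL (opcode 0xe8). The displacement is relative to the end of the 5-byte instruction, so relocating the call site from src to dst while keeping the same target requires rel32 += src - dst. The addresses below are made up for the sketch. ]

#include <stdint.h>
#include <stdio.h>

/* target addressed by an E8 rel32 call located at 'insn' (5 bytes long) */
static uint64_t call_target(uint64_t insn, int32_t rel32)
{
	return insn + 5 + (int64_t)rel32;
}

int main(void)
{
	uint64_t src = 0x1000, dst = 0x2000;	/* hypothetical addresses */
	int32_t rel = 0x500;			/* call at src targets 0x1505 */
	uint64_t target = call_target(src, rel);

	rel += (int32_t)(src - dst);		/* the fixup the patch applies */
	printf("target preserved after move: %s\n",
	       call_target(dst, rel) == target ? "yes" : "no");
	return 0;
}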
Signed-off-by: Jan Beulich Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4B2BB8180200007800026AE7@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 7 ++++++- arch/x86/include/asm/uaccess_64.h | 21 ++++++++++++++++++++- arch/x86/kernel/alternative.c | 4 +++- arch/x86/kernel/x8664_ksyms_64.c | 3 ++- arch/x86/lib/copy_user_64.S | 6 ------ 5 files changed, 31 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 69b74a7b877f..3b5b828767b6 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -125,11 +125,16 @@ static inline void alternatives_smp_switch(int smp) {} asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ : output : "i" (0), ## input) +/* Like alternative_io, but for replacing a direct call with another one. */ +#define alternative_call(oldfunc, newfunc, feature, output, input...) \ + asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ + : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) + /* * use this macro(s) if you need more than one output parameter * in alternative_io */ -#define ASM_OUTPUT2(a, b) a, b +#define ASM_OUTPUT2(a...) a struct paravirt_patch_site; #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 46324c6a4f6e..a78c40305447 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include /* @@ -16,7 +18,24 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_generic(void *to, const void *from, unsigned len); +copy_user_generic_string(void *to, const void *from, unsigned len); +__must_check unsigned long +copy_user_generic_unrolled(void *to, const void *from, unsigned len); + +static __always_inline __must_check unsigned long +copy_user_generic(void *to, const void *from, unsigned len) +{ + unsigned ret; + + alternative_call(copy_user_generic_unrolled, + copy_user_generic_string, + X86_FEATURE_REP_GOOD, + ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), + "=d" (len)), + "1" (to), "2" (from), "3" (len) + : "memory", "rcx", "r8", "r9", "r10", "r11"); + return ret; +} __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned len); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de7353c0ce9c..2589ea4c60ce 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -205,7 +205,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; - char insnbuf[MAX_PATCH_LEN]; + u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); for (a = start; a < end; a++) { @@ -223,6 +223,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start, } #endif memcpy(insnbuf, a->replacement, a->replacementlen); + if (*insnbuf == 0xe8 && a->replacementlen == 5) + *(s32 *)(insnbuf + 1) += a->replacement - a->instr; add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); text_poke_early(instr, insnbuf, a->instrlen); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 619f7f88b8cc..693920b22496 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -26,7 +26,8 @@ EXPORT_SYMBOL(__put_user_2); 
EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); -EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(copy_user_generic_string); +EXPORT_SYMBOL(copy_user_generic_unrolled); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index cf889d4e076a..71100c98e337 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -90,12 +90,6 @@ ENTRY(_copy_from_user) CFI_ENDPROC ENDPROC(_copy_from_user) -ENTRY(copy_user_generic) - CFI_STARTPROC - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(copy_user_generic) - .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) -- cgit v1.2.2 From 7269e8812a59f74fb1ce134465d0bcf5683b93a1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Dec 2009 16:16:03 +0000 Subject: x86-64: Modify memcpy()/memset() alternatives mechanism In order to avoid unnecessary chains of branches, rather than implementing memcpy()/memset()'s access to their alternative implementations via a jump, patch the (larger) original function directly. The memcpy() part of this is slightly subtle: while alternative instruction patching does itself use memcpy(), with the replacement block being less than 64-bytes in size the main loop of the original function doesn't get used for copying memcpy_c() over memcpy(), and hence we can safely write over its beginning. Also note that the CFI annotations are fine for both variants of each of the functions. Signed-off-by: Jan Beulich Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4B2BB8D30200007800026AF2@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/memcpy_64.S | 23 ++++++++--------------- arch/x86/lib/memset_64.S | 18 ++++++------------ 2 files changed, 14 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index ad5441ed1b57..f82e884928af 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -20,12 +20,11 @@ /* * memcpy_c() - fast string ops (REP MOVSQ) based variant. * - * Calls to this get patched into the kernel image via the + * This gets patched over the unrolled variant (below) via the * alternative instructions framework: */ - ALIGN -memcpy_c: - CFI_STARTPROC + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c: movq %rdi, %rax movl %edx, %ecx @@ -35,8 +34,8 @@ memcpy_c: movl %edx, %ecx rep movsb ret - CFI_ENDPROC -ENDPROC(memcpy_c) +.Lmemcpy_e: + .previous ENTRY(__memcpy) ENTRY(memcpy) @@ -128,16 +127,10 @@ ENDPROC(__memcpy) * It is also a lot simpler. Use this when possible: */ - .section .altinstr_replacement, "ax" -1: .byte 0xeb /* jmp */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions, "a" .align 8 .quad memcpy - .quad 1b + .quad .Lmemcpy_c .byte X86_FEATURE_REP_GOOD /* @@ -145,6 +138,6 @@ ENDPROC(__memcpy) * so it is silly to overwrite itself with nops - reboot is the * only outcome... 
*/ - .byte 2b - 1b - .byte 2b - 1b + .byte .Lmemcpy_e - .Lmemcpy_c + .byte .Lmemcpy_e - .Lmemcpy_c .previous diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2c5948116bd2..e88d3b81644a 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -12,9 +12,8 @@ * * rax original destination */ - ALIGN -memset_c: - CFI_STARTPROC + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c: movq %rdi,%r9 movl %edx,%r8d andl $7,%r8d @@ -29,8 +28,8 @@ memset_c: rep stosb movq %r9,%rax ret - CFI_ENDPROC -ENDPROC(memset_c) +.Lmemset_e: + .previous ENTRY(memset) ENTRY(__memset) @@ -118,16 +117,11 @@ ENDPROC(__memset) #include - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (memset_c - memset) - (2f - 1b) /* offset */ -2: - .previous .section .altinstructions,"a" .align 8 .quad memset - .quad 1b + .quad .Lmemset_c .byte X86_FEATURE_REP_GOOD .byte .Lfinal - memset - .byte 2b - 1b + .byte .Lmemset_e - .Lmemset_c .previous -- cgit v1.2.2 From 9a7262a0563da6b91019156abf487bcdf1a41526 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 28 Dec 2009 13:28:25 -0800 Subject: x86_64 SGI UV: Fix writes to led registers on remote uv hubs. The wrong address was being used to write the SCIR led regs on remote hubs. Also, there was an inconsistency between how BIOS and the kernel indexed these regs. Standardize on using the lower 6 bits of the APIC ID as the index. This patch fixes the problem of writing to an errant address to a cpu # >= 64. Signed-off-by: Mike Travis Reviewed-by: Jack Steiner Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uv/uv_hub.h | 20 +++++++++++++++++++- arch/x86/kernel/apic/x2apic_uv_x.c | 12 ++++++------ 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 811bfabc80b7..bcdb708993d2 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -338,6 +338,18 @@ static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long o return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val); } +static inline void uv_write_global_mmr8(int pnode, unsigned long offset, + unsigned char val) +{ + writeb(val, uv_global_mmr64_address(pnode, offset)); +} + +static inline unsigned char uv_read_global_mmr8(int pnode, + unsigned long offset) +{ + return readb(uv_global_mmr64_address(pnode, offset)); +} + /* * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. 
@@ -457,11 +469,17 @@ static inline void uv_set_scir_bits(unsigned char value) } } +static inline unsigned long uv_scir_offset(int apicid) +{ + return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f); +} + static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_write_global_mmr8(uv_cpu_to_pnode(cpu), + uv_cpu_hub_info(cpu)->scir.offset, value); uv_cpu_hub_info(cpu)->scir.state = value; - uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); } }
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d56b0efb2057..5f92494dab61 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -629,8 +629,10 @@ void __init uv_system_init(void) uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -651,15 +653,13 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", + cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */
-- cgit v1.2.2
From d7f0eea9e431e1b8b0742a74db1a9490730b2a25 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Dec 2009 15:36:42 +0800 Subject: ACPI: introduce kernel parameter acpi_sleep=sci_force_enable Introduce the kernel parameter acpi_sleep=sci_force_enable. Some laptops require SCI_EN to be set directly on resume, or else they hang somewhere in the resume code path. We already have a blacklist for these laptops, but we still need this option, especially when debugging suspend/resume problems, in case there are systems that need this workaround and are not yet in the blacklist. Signed-off-by: Zhang Rui Acked-by: Rafael J. Wysocki Signed-off-by: Len Brown --- arch/x86/kernel/acpi/sleep.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b91..f9961034e557 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "sci_force_enable", 16) == 0) + acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t");
-- cgit v1.2.2
From 48b5ba9cc98d676712da29d9931f1c88e5185ff2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Dec 2009 05:53:02 +0100 Subject: perf: Pass appropriate frame pointer to dump_trace() Pass the frame pointer from the regs of the interrupted path to dump_trace() while processing the stack trace.
Currently, dump_trace() takes the current bp and starts the callchain from dump_trace() itself. This is wasteful because we need to walk through the entire NMI/DEBUG stack before retrieving the interrupted point. We can fix that by just using the frame pointer from the captured regs. It points exactly where we want to start. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1262235183-5320-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c223b7e895d9..d616c06e99b4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2347,7 +2347,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } /*
-- cgit v1.2.2
From 9dad0fd5a73d4048dff18069733c0b515f68df74 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 22 Dec 2009 15:40:39 -0800 Subject: x86: Fix size for ex trampoline with 32bit Fix an error introduced by: | x86: Use find_e820() instead of hard coded trampoline address The "EX TRAMPOLINE" reservation should end at PAGE_SIZE + PAGE_SIZE. Signed-off-by: Yinghai Lu LKML-Reference: <1261525263-13763-2-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 05ed7ab2ca48..a1a7876cadcb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -733,13 +733,13 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 }, + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, #endif {}
-- cgit v1.2.2
From a557aae29cf5916295c234d4b10ba3f8f29b8a96 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 22 Dec 2009 15:40:40 -0800 Subject: x86/pci: Intel ioh bus num reg accessing fix The bus number register is above 0x100 (PCI-Express extended register space), so if mmconf is not enabled, we can't access it. [ hpa: changed the bound from 0x200 to 0x120, which is the tight bound. ] Reported-by: Jens Axboe Signed-off-by: Yinghai Lu LKML-Reference: <1261525263-13763-3-git-send-email-yinghai@kernel.org> Signed-off-by: H.
Peter Anvin --- arch/x86/pci/intel_bus.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c index b7a55dc55d13..f81a2fa8fe25 100644 --- a/arch/x86/pci/intel_bus.c +++ b/arch/x86/pci/intel_bus.c @@ -49,6 +49,10 @@ static void __devinit pci_root_bus_res(struct pci_dev *dev) u64 mmioh_base, mmioh_end; int bus_base, bus_end; + /* some sys doesn't get mmconf enabled */ + if (dev->cfg_size < 0x120) + return; + if (pci_root_num >= PCI_ROOT_NR) { printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n"); return; -- cgit v1.2.2 From f4b825bde98938f160315d655597bc9731521cae Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 5 Jan 2010 12:48:49 +1030 Subject: Revert "x86: Side-step lguest problem by only building cmpxchg8b_emu for pre-Pentium" This reverts commit ae1b22f6e46c03cede7cea234d0bf2253b4261cf. As Linus said in 982d007a6ee: "There was something really messy about cmpxchg8b and clone CPU's, so if you enable it on other CPUs later, do it carefully." This breaks lguest for those configs, but we can fix that by emulating if we have to. Fixes: http://bugzilla.kernel.org/show_bug.cgi?id=14884 Signed-off-by: Rusty Russell LKML-Reference: <201001051248.49700.rusty@rustcorp.com.au> Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 08e442bc3ab9..f20ddf84a893 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -396,7 +396,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on !M386 && !M486 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM # this should be set for all -march=.. options where the compiler # generates cmov. -- cgit v1.2.2 From 99d113b17e8ca5a8b68a9d3f7691e2f552dd6a06 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Jan 2010 16:16:06 -0800 Subject: x86, apic: Reclaim IDT vectors 0x20-0x2f Reclaim 16 IDT vectors and make them available for general allocation. Reclaim vectors 0x20-0x2f by reallocating the IRQ_MOVE_CLEANUP_VECTOR to vector 0x1f. This is in the range of vector numbers that is officially reserved for the CPU (for exceptions), however, the use of the APIC to generate any vector 0x10 or above is documented, and the CPU internally can receive any vector number (the legacy BIOS uses INT 0x08-0x0f for interrupts, as messed up as that is.) Since IRQ_MOVE_CLEANUP_VECTOR has to be alone in the lowest-numbered priority level (block of 16), this effectively enables us to reclaim an otherwise-unusable APIC priority level and put it to use. Since this is a transient kernel-only allocation we can change it at any time, and if/when there is an exception at vector 0x1f this assignment needs to be changed as part of OS enabling that new feature. Signed-off-by: Yinghai Lu LKML-Reference: <4B4284C6.9030107@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_vectors.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3ab43df089cd..dbc81acb7e93 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -30,26 +30,38 @@ /* * IDT vectors usable for external interrupt sources start * at 0x20: + * hpa said we can start from 0x1f. 
+ * 0x1f is documented as reserved. However, the ability for the APIC + * to generate vectors starting at 0x10 is documented, as is the + * ability for the CPU to receive any vector number as an interrupt. + * 0x1f is used for IRQ_MOVE_CLEANUP_VECTOR since that vector needs + * an entire privilege level (16 vectors) all by itself at a higher + * priority than any actual device vector. Thus, by placing it in the + * otherwise-unusable 0x10 privilege level, we avoid wasting a full + * 16-vector block. */ -#define FIRST_EXTERNAL_VECTOR 0x20 +#define FIRST_EXTERNAL_VECTOR 0x1f +#define IA32_SYSCALL_VECTOR 0x80 #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 -# define IA32_SYSCALL_VECTOR 0x80 -#else -# define IA32_SYSCALL_VECTOR 0x80 #endif /* - * Reserve the lowest usable priority level 0x20 - 0x2f for triggering + * Reserve the lowest usable priority level 0x10 - 0x1f for triggering * cleanup after irq migration. + * this overlaps with the reserved range for cpu exceptions so this + * will need to be changed to 0x20 - 0x2f if the last cpu exception is + * ever allocated. */ + #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR /* - * Vectors 0x30-0x3f are used for ISA interrupts. + * Vectors 0x20-0x2f are used for ISA interrupts. + * round up to the next 16-vector boundary */ -#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) #define IRQ1_VECTOR (IRQ0_VECTOR + 1) #define IRQ2_VECTOR (IRQ0_VECTOR + 2) @@ -122,7 +134,7 @@ /* * First APIC vector available to drivers: (vectors 0x30-0xee) we - * start at 0x31(0x41) to spread out vectors evenly between priority + * start at 0x31 to spread out vectors evenly between priority * levels. (0x80 is the syscall vector) */ #define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) -- cgit v1.2.2 From ea94396629a3e0cb9a3a9c75335b1de255b30426 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Jan 2010 21:14:41 -0800 Subject: x86, apic: Don't waste a vector to improve vector spread We want to use a vector-assignment sequence that avoids stumbling onto 0x80 earlier in the sequence, in order to improve the spread of vectors across priority levels on machines with a small number of interrupt sources. Right now, this is done by simply making the first vector (0x31 or 0x41) completely unusable. This is unnecessary; all we need is to start assignment at a +1 offset, we don't actually need to prohibit the usage of this vector once we have wrapped around. Signed-off-by: H. Peter Anvin LKML-Reference: <4B426550.6000209@kernel.org> --- arch/x86/include/asm/irq_vectors.h | 9 +++++---- arch/x86/kernel/apic/io_apic.c | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index dbc81acb7e93..585a42810cf8 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -133,11 +133,12 @@ #define MCE_SELF_VECTOR 0xeb /* - * First APIC vector available to drivers: (vectors 0x30-0xee) we - * start at 0x31 to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) + * First APIC vector available to drivers: (vectors 0x30-0xee). We + * start allocating at 0x31 to spread out vectors evenly between + * priority levels. 
(0x80 is the syscall vector) */ -#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) +#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 1) +#define VECTOR_OFFSET_START 1 #define NR_VECTORS 256
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d9cd1f1b9c07..e9ba0903e9d5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1162,7 +1162,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + static int current_vector = FIRST_DEVICE_VECTOR + VECTOR_OFFSET_START; + static int current_offset = VECTOR_OFFSET_START % 8; unsigned int old_vector; int cpu, err; cpumask_var_t tmp_mask;
-- cgit v1.2.2
From 38b7827fcdd660f591d645bd3ae6644456a4773c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:49 +0900 Subject: local_t: Remove cpu_local_xx macros These macros have not been used for a while now. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/local.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 47b9b6f19057..2e9972468a5d 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -195,41 +195,4 @@ static inline long local_sub_return(long i, local_t *l) #define __local_add(i, l) local_add((i), (l)) #define __local_sub(i, l) local_sub((i), (l)) -/* Use these for per-cpu local_t variables: on some archs they are - * much more efficient than these naive implementations. Note they take - * a variable, not an address. - * - * X86_64: This could be done better if we moved the per cpu data directly - * after GS. - */ - -/* Need to disable preemption for the cpu local counters otherwise we could - still access a variable of a previous CPU in a non atomic way. */ -#define cpu_local_wrap_v(l) \ -({ \ - local_t res__; \ - preempt_disable(); \ - res__ = (l); \ - preempt_enable(); \ - res__; \ -}) -#define cpu_local_wrap(l) \ -({ \ - preempt_disable(); \ - (l); \ - preempt_enable(); \ -}) \ - -#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l)))) -#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i))) -#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l)))) -#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l)))) -#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l)))) -#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l)))) - -#define __cpu_local_inc(l) cpu_local_inc((l)) -#define __cpu_local_dec(l) cpu_local_dec((l)) -#define __cpu_local_add(i, l) cpu_local_add((i), (l)) -#define __cpu_local_sub(i, l) cpu_local_sub((i), (l)) - #endif /* _ASM_X86_LOCAL_H */
-- cgit v1.2.2
From 5917dae83cb02dfe74c9167b79e86e6d65183fa3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: percpu, x86: Generic inc / dec percpu instructions Optimize code generated for percpu access by checking for increments and decrements. tj: fix incorrect usage of __builtin_constant_p() and restructure percpu_add_op() macro.
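[ Editor's illustration, not part of the patch: a userspace sketch of the constant-folding idea behind percpu_add_op(). With a literal +1 or -1 addend the compiler statically selects the inc/dec arm; anything else falls through to a plain add. Relies on __builtin_constant_p (GCC/Clang only); the real macro emits incX/decX/addX asm on a %gs-relative operand instead of plain C arithmetic. ]

#include <stdio.h>

#define add_op(var, val)						\
do {									\
	/* folds to 1 or -1 only for those compile-time constants */	\
	const int pao_id = (__builtin_constant_p(val) &&		\
			    ((val) == 1 || (val) == -1)) ? (val) : 0;	\
	if (pao_id == 1)						\
		++(var);		/* "inc" arm */			\
	else if (pao_id == -1)						\
		--(var);		/* "dec" arm */			\
	else								\
		(var) += (val);		/* generic "add" arm */		\
} while (0)

int main(void)
{
	long counter = 0;

	add_op(counter, 1);	/* selects the inc arm */
	add_op(counter, -1);	/* selects the dec arm */
	add_op(counter, 5);	/* generic add */
	printf("%ld\n", counter);	/* prints 5 */
	return 0;
}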
Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 86 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 4c170ccc72ed..66a272dfd8b8 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -103,6 +103,64 @@ do { \ } \ } while (0) +/* + * Generate a percpu add to memory instruction and optimize code + * if a one is added or subtracted. + */ +#define percpu_add_op(var, val) \ +do { \ + typedef typeof(var) pao_T__; \ + const int pao_ID__ = (__builtin_constant_p(val) && \ + ((val) == 1 || (val) == -1)) ? (val) : 0; \ + if (0) { \ + pao_T__ pao_tmp__; \ + pao_tmp__ = (val); \ + } \ + switch (sizeof(var)) { \ + case 1: \ + if (pao_ID__ == 1) \ + asm("incb "__percpu_arg(0) : "+m" (var)); \ + else if (pao_ID__ == -1) \ + asm("decb "__percpu_arg(0) : "+m" (var)); \ + else \ + asm("addb %1, "__percpu_arg(0) \ + : "+m" (var) \ + : "qi" ((pao_T__)(val))); \ + break; \ + case 2: \ + if (pao_ID__ == 1) \ + asm("incw "__percpu_arg(0) : "+m" (var)); \ + else if (pao_ID__ == -1) \ + asm("decw "__percpu_arg(0) : "+m" (var)); \ + else \ + asm("addw %1, "__percpu_arg(0) \ + : "+m" (var) \ + : "ri" ((pao_T__)(val))); \ + break; \ + case 4: \ + if (pao_ID__ == 1) \ + asm("incl "__percpu_arg(0) : "+m" (var)); \ + else if (pao_ID__ == -1) \ + asm("decl "__percpu_arg(0) : "+m" (var)); \ + else \ + asm("addl %1, "__percpu_arg(0) \ + : "+m" (var) \ + : "ri" ((pao_T__)(val))); \ + break; \ + case 8: \ + if (pao_ID__ == 1) \ + asm("incq "__percpu_arg(0) : "+m" (var)); \ + else if (pao_ID__ == -1) \ + asm("decq "__percpu_arg(0) : "+m" (var)); \ + else \ + asm("addq %1, "__percpu_arg(0) \ + : "+m" (var) \ + : "re" ((pao_T__)(val))); \ + break; \ + default: __bad_percpu_size(); \ + } \ +} while (0) + #define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) pfo_ret__; \ @@ -144,8 +202,8 @@ do { \ #define percpu_read(var) percpu_from_op("mov", var, "m" (var)) #define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) #define percpu_write(var, val) percpu_to_op("mov", var, val) -#define percpu_add(var, val) percpu_to_op("add", var, val) -#define percpu_sub(var, val) percpu_to_op("sub", var, val) +#define percpu_add(var, val) percpu_add_op(var, val) +#define percpu_sub(var, val) percpu_add_op(var, -(val)) #define percpu_and(var, val) percpu_to_op("and", var, val) #define percpu_or(var, val) percpu_to_op("or", var, val) #define percpu_xor(var, val) percpu_to_op("xor", var, val) @@ -157,9 +215,9 @@ do { \ #define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) #define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) -#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) -#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) #define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) #define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) #define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) @@ -176,9 +234,9 @@ do { \ #define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define 
this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) -#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) -#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) #define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) #define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) #define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) @@ -189,9 +247,9 @@ do { \ #define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) -#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) -#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val) #define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) #define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) #define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) @@ -209,19 +267,19 @@ do { \ #ifdef CONFIG_X86_64 #define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) -- cgit v1.2.2 From 409d02ef6d74f5e91f5ea4c587b2ee1375f106fc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Jan 2010 14:19:11 +0100 Subject: x86: copy_from_user() should not return -EFAULT Callers of copy_from_user() expect it to return the number of bytes it could not copy. In no case it is supposed to return -EFAULT. In case of a detected buffer overflow just return the requested length. In addition one could think of a memset that would clear the size of the target object. [ hpa: code is not in .32 so not needed for -stable ] Signed-off-by: Heiko Carstens Acked-by: Arjan van de Ven LKML-Reference: <20100105131911.GC5480@osiris.boeblingen.de.ibm.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/uaccess_32.h | 5 ++--- arch/x86/include/asm/uaccess_64.h | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 0c9825e97f36..088d09fb1615 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -205,14 +205,13 @@ static inline unsigned long __must_check copy_from_user(void *to, unsigned long n) { int sz = __compiletime_object_size(to); - int ret = -EFAULT; if (likely(sz == -1 || sz >= n)) - ret = _copy_from_user(to, from, n); + n = _copy_from_user(to, from, n); else copy_from_user_overflow(); - return ret; + return n; } long __must_check strncpy_from_user(char *dst, const char __user *src, diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 46324c6a4f6e..535e421498f6 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -30,16 +30,15 @@ static inline unsigned long __must_check copy_from_user(void *to, unsigned long n) { int sz = __compiletime_object_size(to); - int ret = -EFAULT; might_fault(); if (likely(sz == -1 || sz >= n)) - ret = _copy_from_user(to, from, n); + n = _copy_from_user(to, from, n); #ifdef CONFIG_DEBUG_VM else WARN(1, "Buffer overflow detected!\n"); #endif - return ret; + return n; } static __always_inline __must_check -- cgit v1.2.2 From db677ffa5f5a4f15b9dad4d132b3477b80766d82 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 5 Jan 2010 12:48:49 +1030 Subject: Revert "x86: Side-step lguest problem by only building cmpxchg8b_emu for pre-Pentium" This reverts commit ae1b22f6e46c03cede7cea234d0bf2253b4261cf. As Linus said in 982d007a6ee: "There was something really messy about cmpxchg8b and clone CPU's, so if you enable it on other CPUs later, do it carefully." This breaks lguest for those configs, but we can fix that by emulating if we have to. Fixes: http://bugzilla.kernel.org/show_bug.cgi?id=14884 Signed-off-by: Rusty Russell Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 08e442bc3ab9..f20ddf84a893 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -396,7 +396,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on !M386 && !M486 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM # this should be set for all -march=.. options where the compiler # generates cmov. -- cgit v1.2.2 From 7f41c2e1523f628cc248e34192162aec5728bed7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 6 Jan 2010 10:56:31 -0800 Subject: x86, irq: Check move_in_progress before freeing the vector mapping With the recent irq migration fixes (post 2.6.32), Gary Hade has noticed "No IRQ handler for vector" messages during the 2.6.33-rc1 kernel boot on IBM AMD platforms and root caused the issue to this commit: > commit 23359a88e7eca3c4f402562b102f23014db3c2aa > Author: Suresh Siddha > Date: Mon Oct 26 14:24:33 2009 -0800 > > x86: Remove move_cleanup_count from irq_cfg As part of this patch, we have removed the move_cleanup_count check in smp_irq_move_cleanup_interrupt(). With this change, we can run into a situation where an irq cleanup interrupt on a cpu can cleanup the vector mappings associated with multiple irqs, of which one of the irq's migration might be still in progress. 
As such, when that irq hits the old cpu, we get the "No IRQ handler" messages. Fix this by checking the irq_cfg's move_in_progress flag; if the move is still in progress, delay the vector cleanup to another irq cleanup interrupt request (which will happen when the irq starts arriving at the new cpu destination). Reported-and-tested-by: Gary Hade Signed-off-by: Suresh Siddha LKML-Reference: <1262804191.2732.7.camel@sbs-t61.sc.intel.com> Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index de00c4619a55..53243ca7816d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2434,6 +2434,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); raw_spin_lock(&desc->lock); + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock;
-- cgit v1.2.2
From 8558e3943df1c51c3377cb4e8a52ea484d6f357d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Jan 2010 16:11:06 -0500 Subject: x86, ACPI: delete acpi_boot_table_init() return value Cleanup only: setup_arch() doesn't care if ACPI initialization succeeded or failed, so delete acpi_boot_table_init()'s return value. Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb1035cd9a6a..036d28adf59d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1529,16 +1529,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * if acpi_blacklisted() acpi_disabled = 1; * acpi_irq_model=... * ... - * - * return value: (currently ignored) - * 0: success - * !0: failure */ -int __init acpi_boot_table_init(void) +void __init acpi_boot_table_init(void) { - int error; - dmi_check_system(acpi_dmi_table); /* @@ -1546,15 +1540,14 @@ int __init acpi_boot_table_init(void) * One exception: acpi=ht continues far enough to enumerate LAPICs */ if (acpi_disabled && !acpi_ht) - return 1; + return; /* * Initialize the ACPI boot-time table parser. */ - error = acpi_table_init(); - if (error) { + if (acpi_table_init()) { disable_acpi(); - return error; + return; } acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1562,18 +1555,15 @@ int __init acpi_boot_table_init(void) /* * blacklist may disable ACPI entirely */ - error = acpi_blacklisted(); - if (error) { + if (acpi_blacklisted()) { if (acpi_force) { printk(KERN_WARNING PREFIX "acpi=force override\n"); } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return error; + return; } } - - return 0; } int __init early_acpi_boot_init(void)
-- cgit v1.2.2
From 1a3b1d89eded68d64e5ea409ad37827310059441 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 7 Jan 2010 11:53:33 -0500 Subject: x86: Split atomic64_t functions into separate headers Split atomic64_t functions out into separate headers, since they will not be practical to merge between 32 and 64 bits. Signed-off-by: Brian Gerst LKML-Reference: <1262883215-4034-2-git-send-email-brgerst@gmail.com> Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/atomic64_32.h | 160 ++++++++++++++++++++++++++ arch/x86/include/asm/atomic64_64.h | 224 +++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/atomic_32.h | 152 +------------------------ arch/x86/include/asm/atomic_64.h | 217 +---------------------------------- 4 files changed, 386 insertions(+), 367 deletions(-) create mode 100644 arch/x86/include/asm/atomic64_32.h create mode 100644 arch/x86/include/asm/atomic64_64.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h new file mode 100644 index 000000000000..03027bf28de5 --- /dev/null +++ b/arch/x86/include/asm/atomic64_32.h @@ -0,0 +1,160 @@ +#ifndef _ASM_X86_ATOMIC64_32_H +#define _ASM_X86_ATOMIC64_32_H + +#include +#include +#include +//#include + +/* An 64bit atomic type */ + +typedef struct { + u64 __aligned(8) counter; +} atomic64_t; + +#define ATOMIC64_INIT(val) { (val) } + +extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); + +/** + * atomic64_xchg - xchg atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. + */ +extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +extern void atomic64_set(atomic64_t *ptr, u64 new_val); + +/** + * atomic64_read - read atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically reads the value of @ptr and returns it. + */ +static inline u64 atomic64_read(atomic64_t *ptr) +{ + u64 res; + + /* + * Note, we inline this atomic64_t primitive because + * it only clobbers EAX/EDX and leaves the others + * untouched. We also (somewhat subtly) rely on the + * fact that cmpxchg8b returns the current 64-bit value + * of the memory location we are touching: + */ + asm volatile( + "mov %%ebx, %%eax\n\t" + "mov %%ecx, %%edx\n\t" + LOCK_PREFIX "cmpxchg8b %1\n" + : "=&A" (res) + : "m" (*ptr) + ); + + return res; +} + +extern u64 atomic64_read(atomic64_t *ptr); + +/** + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); + +/* + * Other variants with different arithmetic operators: + */ +extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); +extern u64 atomic64_inc_return(atomic64_t *ptr); +extern u64 atomic64_dec_return(atomic64_t *ptr); + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +extern void atomic64_add(u64 delta, atomic64_t *ptr); + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +extern void atomic64_sub(u64 delta, atomic64_t *ptr); + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. 
+ */ +extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +extern void atomic64_inc(atomic64_t *ptr); + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +extern void atomic64_dec(atomic64_t *ptr); + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +extern int atomic64_dec_and_test(atomic64_t *ptr); + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +extern int atomic64_inc_and_test(atomic64_t *ptr); + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); + +#endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h new file mode 100644 index 000000000000..51c5b4056929 --- /dev/null +++ b/arch/x86/include/asm/atomic64_64.h @@ -0,0 +1,224 @@ +#ifndef _ASM_X86_ATOMIC64_64_H +#define _ASM_X86_ATOMIC64_64_H + +#include +#include +#include + +/* The 64-bit atomic type */ + +#define ATOMIC64_INIT(i) { (i) } + +/** + * atomic64_read - read atomic64 variable + * @v: pointer of type atomic64_t + * + * Atomically reads the value of @v. + * Doesn't imply a read memory barrier. + */ +static inline long atomic64_read(const atomic64_t *v) +{ + return v->counter; +} + +/** + * atomic64_set - set atomic64 variable + * @v: pointer to type atomic64_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +static inline void atomic64_set(atomic64_t *v, long i) +{ + v->counter = i; +} + +/** + * atomic64_add - add integer to atomic64 variable + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v. + */ +static inline void atomic64_add(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "addq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic64_sub(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "subq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_sub_and_test(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_inc - increment atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1. 
+ */ +static inline void atomic64_inc(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "incq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic64_dec(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "decq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec_and_test - decrement and test + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic64_dec_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_inc_and_test - increment and test + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_inc_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic64_add_negative(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_add_return - add and return + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline long atomic64_add_return(long i, atomic64_t *v) +{ + long __i = i; + asm volatile(LOCK_PREFIX "xaddq %0, %1;" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +static inline long atomic64_sub_return(long i, atomic64_t *v) +{ + return atomic64_add_return(-i, v); +} + +#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) +#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) + +static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic64_xchg(atomic64_t *v, long new) +{ + return xchg(&v->counter, new); +} + +/** + * atomic64_add_unless - add unless the number is a given value + * @v: pointer of type atomic64_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. 
+ */ +static inline int atomic64_add_unless(atomic64_t *v, long a, long u) +{ + long c, old; + c = atomic64_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic64_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) + +#endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index dc5a667ff791..e128ae988cc9 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -260,156 +260,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -/* An 64bit atomic type */ - -typedef struct { - u64 __aligned(8) counter; -} atomic64_t; - -#define ATOMIC64_INIT(val) { (val) } - -extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); - -/** - * atomic64_xchg - xchg atomic64 variable - * @ptr: pointer to type atomic64_t - * @new_val: value to assign - * - * Atomically xchgs the value of @ptr to @new_val and returns - * the old value. - */ -extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); - -/** - * atomic64_set - set atomic64 variable - * @ptr: pointer to type atomic64_t - * @new_val: value to assign - * - * Atomically sets the value of @ptr to @new_val. - */ -extern void atomic64_set(atomic64_t *ptr, u64 new_val); - -/** - * atomic64_read - read atomic64 variable - * @ptr: pointer to type atomic64_t - * - * Atomically reads the value of @ptr and returns it. - */ -static inline u64 atomic64_read(atomic64_t *ptr) -{ - u64 res; - - /* - * Note, we inline this atomic64_t primitive because - * it only clobbers EAX/EDX and leaves the others - * untouched. We also (somewhat subtly) rely on the - * fact that cmpxchg8b returns the current 64-bit value - * of the memory location we are touching: - */ - asm volatile( - "mov %%ebx, %%eax\n\t" - "mov %%ecx, %%edx\n\t" - LOCK_PREFIX "cmpxchg8b %1\n" - : "=&A" (res) - : "m" (*ptr) - ); - - return res; -} - -extern u64 atomic64_read(atomic64_t *ptr); - -/** - * atomic64_add_return - add and return - * @delta: integer value to add - * @ptr: pointer to type atomic64_t - * - * Atomically adds @delta to @ptr and returns @delta + *@ptr - */ -extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); - -/* - * Other variants with different arithmetic operators: - */ -extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); -extern u64 atomic64_inc_return(atomic64_t *ptr); -extern u64 atomic64_dec_return(atomic64_t *ptr); - -/** - * atomic64_add - add integer to atomic64 variable - * @delta: integer value to add - * @ptr: pointer to type atomic64_t - * - * Atomically adds @delta to @ptr. - */ -extern void atomic64_add(u64 delta, atomic64_t *ptr); - -/** - * atomic64_sub - subtract the atomic64 variable - * @delta: integer value to subtract - * @ptr: pointer to type atomic64_t - * - * Atomically subtracts @delta from @ptr. - */ -extern void atomic64_sub(u64 delta, atomic64_t *ptr); - -/** - * atomic64_sub_and_test - subtract value from variable and test result - * @delta: integer value to subtract - * @ptr: pointer to type atomic64_t - * - * Atomically subtracts @delta from @ptr and returns - * true if the result is zero, or false for all - * other cases. 
- */ -extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); - -/** - * atomic64_inc - increment atomic64 variable - * @ptr: pointer to type atomic64_t - * - * Atomically increments @ptr by 1. - */ -extern void atomic64_inc(atomic64_t *ptr); - -/** - * atomic64_dec - decrement atomic64 variable - * @ptr: pointer to type atomic64_t - * - * Atomically decrements @ptr by 1. - */ -extern void atomic64_dec(atomic64_t *ptr); - -/** - * atomic64_dec_and_test - decrement and test - * @ptr: pointer to type atomic64_t - * - * Atomically decrements @ptr by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -extern int atomic64_dec_and_test(atomic64_t *ptr); - -/** - * atomic64_inc_and_test - increment and test - * @ptr: pointer to type atomic64_t - * - * Atomically increments @ptr by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -extern int atomic64_inc_and_test(atomic64_t *ptr); - -/** - * atomic64_add_negative - add and test if negative - * @delta: integer value to add - * @ptr: pointer to type atomic64_t - * - * Atomically adds @delta to @ptr and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); - +#include #include #endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index d605dc268e79..042c33100c69 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h @@ -187,196 +187,6 @@ static inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) -/* The 64-bit atomic type */ - -#define ATOMIC64_INIT(i) { (i) } - -/** - * atomic64_read - read atomic64 variable - * @v: pointer of type atomic64_t - * - * Atomically reads the value of @v. - * Doesn't imply a read memory barrier. - */ -static inline long atomic64_read(const atomic64_t *v) -{ - return v->counter; -} - -/** - * atomic64_set - set atomic64 variable - * @v: pointer to type atomic64_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ -static inline void atomic64_set(atomic64_t *v, long i) -{ - v->counter = i; -} - -/** - * atomic64_add - add integer to atomic64 variable - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v. - */ -static inline void atomic64_add(long i, atomic64_t *v) -{ - asm volatile(LOCK_PREFIX "addq %1,%0" - : "=m" (v->counter) - : "er" (i), "m" (v->counter)); -} - -/** - * atomic64_sub - subtract the atomic64 variable - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v. - */ -static inline void atomic64_sub(long i, atomic64_t *v) -{ - asm volatile(LOCK_PREFIX "subq %1,%0" - : "=m" (v->counter) - : "er" (i), "m" (v->counter)); -} - -/** - * atomic64_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. 
- */ -static inline int atomic64_sub_and_test(long i, atomic64_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; -} - -/** - * atomic64_inc - increment atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1. - */ -static inline void atomic64_inc(atomic64_t *v) -{ - asm volatile(LOCK_PREFIX "incq %0" - : "=m" (v->counter) - : "m" (v->counter)); -} - -/** - * atomic64_dec - decrement atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1. - */ -static inline void atomic64_dec(atomic64_t *v) -{ - asm volatile(LOCK_PREFIX "decq %0" - : "=m" (v->counter) - : "m" (v->counter)); -} - -/** - * atomic64_dec_and_test - decrement and test - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline int atomic64_dec_and_test(atomic64_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "decq %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -} - -/** - * atomic64_inc_and_test - increment and test - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline int atomic64_inc_and_test(atomic64_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "incq %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -} - -/** - * atomic64_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static inline int atomic64_add_negative(long i, atomic64_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; -} - -/** - * atomic64_add_return - add and return - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns @i + @v - */ -static inline long atomic64_add_return(long i, atomic64_t *v) -{ - long __i = i; - asm volatile(LOCK_PREFIX "xaddq %0, %1;" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; -} - -static inline long atomic64_sub_return(long i, atomic64_t *v) -{ - return atomic64_add_return(-i, v); -} - -#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) -#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) - -static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) -{ - return cmpxchg(&v->counter, old, new); -} - -static inline long atomic64_xchg(atomic64_t *v, long new) -{ - return xchg(&v->counter, new); -} - static inline long atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); @@ -413,30 +223,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) -/** - * atomic64_add_unless - add unless the number is a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as it was not @u. - * Returns non-zero if @v was not @u, and zero otherwise. 
- */ -static inline int atomic64_add_unless(atomic64_t *v, long a, long u) -{ - long c, old; - c = atomic64_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic64_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); -} - /** * atomic_inc_short - increment of a short integer * @v: pointer to type int @@ -463,8 +249,6 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); } -#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) - /* These are x86-specific, used by some header files */ #define atomic_clear_mask(mask, addr) \ asm volatile(LOCK_PREFIX "andl %0,%1" \ @@ -481,5 +265,6 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() +#include #include #endif /* _ASM_X86_ATOMIC_64_H */ -- cgit v1.2.2 From 3ce59bb8352e1c53446bef1ead1c63956dfef64a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 7 Jan 2010 11:53:34 -0500 Subject: x86: Sync asm/atomic_32.h and asm/atomic_64.h Prepare for merging into asm/atomic.h. Signed-off-by: Brian Gerst LKML-Reference: <1262883215-4034-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/atomic_32.h | 42 ++++++++++++++++++--- arch/x86/include/asm/atomic_64.h | 81 ++++++++++++++++++++++++++-------------- 2 files changed, 89 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index e128ae988cc9..036962ef8203 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /* @@ -145,8 +146,8 @@ static inline int atomic_inc_and_test(atomic_t *v) /** * atomic_add_negative - add and test if negative - * @v: pointer of type atomic_t * @i: integer value to add + * @v: pointer of type atomic_t * * Atomically adds @i to @v and returns true * if the result is negative, or false when @@ -164,8 +165,8 @@ static inline int atomic_add_negative(int i, atomic_t *v) /** * atomic_add_return - add integer and return - * @v: pointer of type atomic_t * @i: integer value to add + * @v: pointer of type atomic_t * * Atomically adds @i to @v and returns @i + @v */ @@ -206,6 +207,9 @@ static inline int atomic_sub_return(int i, atomic_t *v) return atomic_add_return(-i, v); } +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); @@ -242,8 +246,33 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) -#define atomic_inc_return(v) (atomic_add_return(1, v)) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) +/** + * atomic_inc_short - increment of a short integer + * @v: pointer to type int + * + * Atomically adds 1 to @v + * Returns the new value of @u + */ +static inline short int atomic_inc_short(short int *v) +{ + asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); + return *v; +} + +#ifdef CONFIG_X86_64 +/** + * atomic_or_long - OR of two long integers + * @v1: pointer to type unsigned long + * @v2: pointer to type unsigned long + * + * Atomically ORs @v1 and @v2 + * Returns the result of the OR + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm(LOCK_PREFIX "orq %1, 
%0" : "+m" (*v1) : "r" (v2)); +} +#endif /* These are x86-specific, used by some header files */ #define atomic_clear_mask(mask, addr) \ @@ -251,8 +280,9 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) : : "r" (~(mask)), "m" (*(addr)) : "memory") #define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" (mask), "m" (*(addr)) : "memory") + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" ((unsigned)(mask)), "m" (*(addr)) \ + : "memory") /* Atomic operations are already serializing on x86 */ #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index 042c33100c69..77407887cfcd 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h @@ -1,7 +1,9 @@ #ifndef _ASM_X86_ATOMIC_64_H #define _ASM_X86_ATOMIC_64_H +#include #include +#include #include #include @@ -45,12 +47,12 @@ static inline void atomic_set(atomic_t *v, int i) static inline void atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" - : "=m" (v->counter) - : "ir" (i), "m" (v->counter)); + : "+m" (v->counter) + : "ir" (i)); } /** - * atomic_sub - subtract the atomic variable + * atomic_sub - subtract integer from atomic variable * @i: integer value to subtract * @v: pointer of type atomic_t * @@ -59,8 +61,8 @@ static inline void atomic_add(int i, atomic_t *v) static inline void atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" - : "=m" (v->counter) - : "ir" (i), "m" (v->counter)); + : "+m" (v->counter) + : "ir" (i)); } /** @@ -77,8 +79,8 @@ static inline int atomic_sub_and_test(int i, atomic_t *v) unsigned char c; asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "ir" (i), "m" (v->counter) : "memory"); + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); return c; } @@ -91,8 +93,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v) static inline void atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" - : "=m" (v->counter) - : "m" (v->counter)); + : "+m" (v->counter)); } /** @@ -104,8 +105,7 @@ static inline void atomic_inc(atomic_t *v) static inline void atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" - : "=m" (v->counter) - : "m" (v->counter)); + : "+m" (v->counter)); } /** @@ -121,8 +121,8 @@ static inline int atomic_dec_and_test(atomic_t *v) unsigned char c; asm volatile(LOCK_PREFIX "decl %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); + : "+m" (v->counter), "=qm" (c) + : : "memory"); return c != 0; } @@ -139,8 +139,8 @@ static inline int atomic_inc_and_test(atomic_t *v) unsigned char c; asm volatile(LOCK_PREFIX "incl %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); + : "+m" (v->counter), "=qm" (c) + : : "memory"); return c != 0; } @@ -158,13 +158,13 @@ static inline int atomic_add_negative(int i, atomic_t *v) unsigned char c; asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" - : "=m" (v->counter), "=qm" (c) - : "ir" (i), "m" (v->counter) : "memory"); + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); return c; } /** - * atomic_add_return - add and return + * atomic_add_return - add integer and return * @i: integer value to add * @v: pointer of type atomic_t * @@ -172,13 +172,36 @@ static inline int atomic_add_negative(int i, atomic_t *v) */ static inline int atomic_add_return(int i, atomic_t *v) { - int __i = i; + int __i; +#ifdef CONFIG_M386 + unsigned long flags; + if (unlikely(boot_cpu_data.x86 <= 3)) + 
goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; asm volatile(LOCK_PREFIX "xaddl %0, %1" : "+r" (i), "+m" (v->counter) : : "memory"); return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); + local_irq_restore(flags); + return i + __i; +#endif } +/** + * atomic_sub_return - subtract integer and return + * @v: pointer of type atomic_t + * @i: integer value to subtract + * + * Atomically subtracts @i from @v and returns @v - @i + */ static inline int atomic_sub_return(int i, atomic_t *v) { return atomic_add_return(-i, v); @@ -187,23 +210,23 @@ static inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) -static inline long atomic_cmpxchg(atomic_t *v, int old, int new) +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); } -static inline long atomic_xchg(atomic_t *v, int new) +static inline int atomic_xchg(atomic_t *v, int new) { return xchg(&v->counter, new); } /** - * atomic_add_unless - add unless the number is a given value + * atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. * - * Atomically adds @a to @v, so long as it was not @u. + * Atomically adds @a to @v, so long as @v was not already @u. * Returns non-zero if @v was not @u, and zero otherwise. */ static inline int atomic_add_unless(atomic_t *v, int a, int u) @@ -236,6 +259,7 @@ static inline short int atomic_inc_short(short int *v) return *v; } +#ifdef CONFIG_X86_64 /** * atomic_or_long - OR of two long integers * @v1: pointer to type unsigned long @@ -248,15 +272,16 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) { asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); } +#endif /* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ : : "r" (~(mask)), "m" (*(addr)) : "memory") -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" ((unsigned)(mask)), "m" (*(addr)) \ +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" ((unsigned)(mask)), "m" (*(addr)) \ : "memory") /* Atomic operations are already serializing on x86 */ -- cgit v1.2.2 From 5abbbbf0b0cd4abf5898136d0c345dc99b859c8c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 7 Jan 2010 11:53:35 -0500 Subject: x86: Merge asm/atomic_{32,64}.h Merge the now identical code from asm/atomic_32.h and asm/atomic_64.h into asm/atomic.h. Signed-off-by: Brian Gerst LKML-Reference: <1262883215-4034-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/atomic.h | 299 ++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/atomic_32.h | 295 -------------------------------------- arch/x86/include/asm/atomic_64.h | 295 -------------------------------------- 3 files changed, 297 insertions(+), 592 deletions(-) delete mode 100644 arch/x86/include/asm/atomic_32.h delete mode 100644 arch/x86/include/asm/atomic_64.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 4e1b8873c474..8baaa719fa7f 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -1,5 +1,300 @@ +#ifndef _ASM_X86_ATOMIC_H +#define _ASM_X86_ATOMIC_H + +#include +#include +#include +#include +#include + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. + */ +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub - subtract integer from atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_sub_and_test(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. + */ +static inline void atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. 
+ */ +static inline int atomic_inc_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic_add_negative(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_add_return - add integer and return + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + int __i; +#ifdef CONFIG_M386 + unsigned long flags; + if (unlikely(boot_cpu_data.x86 <= 3)) + goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); + local_irq_restore(flags); + return i + __i; +#endif +} + +/** + * atomic_sub_return - subtract integer and return + * @v: pointer of type atomic_t + * @i: integer value to subtract + * + * Atomically subtracts @i from @v and returns @v - @i + */ +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} + +/** + * atomic_add_unless - add unless the number is already a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as @v was not already @u. + * Returns non-zero if @v was not @u, and zero otherwise. 
+ */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + +/** + * atomic_inc_short - increment of a short integer + * @v: pointer to type int + * + * Atomically adds 1 to @v + * Returns the new value of @u + */ +static inline short int atomic_inc_short(short int *v) +{ + asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); + return *v; +} + +#ifdef CONFIG_X86_64 +/** + * atomic_or_long - OR of two long integers + * @v1: pointer to type unsigned long + * @v2: pointer to type unsigned long + * + * Atomically ORs @v1 and @v2 + * Returns the result of the OR + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); +} +#endif + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ + : : "r" (~(mask)), "m" (*(addr)) : "memory") + +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" ((unsigned)(mask)), "m" (*(addr)) \ + : "memory") + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + #ifdef CONFIG_X86_32 -# include "atomic_32.h" +# include "atomic64_32.h" #else -# include "atomic_64.h" +# include "atomic64_64.h" #endif + +#include +#endif /* _ASM_X86_ATOMIC_H */ diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h deleted file mode 100644 index 036962ef8203..000000000000 --- a/arch/x86/include/asm/atomic_32.h +++ /dev/null @@ -1,295 +0,0 @@ -#ifndef _ASM_X86_ATOMIC_32_H -#define _ASM_X86_ATOMIC_32_H - -#include -#include -#include -#include -#include - -/* - * Atomic operations that C can't guarantee us. Useful for - * resource counting etc.. - */ - -#define ATOMIC_INIT(i) { (i) } - -/** - * atomic_read - read atomic variable - * @v: pointer of type atomic_t - * - * Atomically reads the value of @v. - */ -static inline int atomic_read(const atomic_t *v) -{ - return v->counter; -} - -/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ -static inline void atomic_set(atomic_t *v, int i) -{ - v->counter = i; -} - -/** - * atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v. - */ -static inline void atomic_add(int i, atomic_t *v) -{ - asm volatile(LOCK_PREFIX "addl %1,%0" - : "+m" (v->counter) - : "ir" (i)); -} - -/** - * atomic_sub - subtract integer from atomic variable - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v. - */ -static inline void atomic_sub(int i, atomic_t *v) -{ - asm volatile(LOCK_PREFIX "subl %1,%0" - : "+m" (v->counter) - : "ir" (i)); -} - -/** - * atomic_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. 
- */ -static inline int atomic_sub_and_test(int i, atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -} - -/** - * atomic_inc - increment atomic variable - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1. - */ -static inline void atomic_inc(atomic_t *v) -{ - asm volatile(LOCK_PREFIX "incl %0" - : "+m" (v->counter)); -} - -/** - * atomic_dec - decrement atomic variable - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1. - */ -static inline void atomic_dec(atomic_t *v) -{ - asm volatile(LOCK_PREFIX "decl %0" - : "+m" (v->counter)); -} - -/** - * atomic_dec_and_test - decrement and test - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline int atomic_dec_and_test(atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "decl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; -} - -/** - * atomic_inc_and_test - increment and test - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline int atomic_inc_and_test(atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "incl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; -} - -/** - * atomic_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static inline int atomic_add_negative(int i, atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -} - -/** - * atomic_add_return - add integer and return - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns @i + @v - */ -static inline int atomic_add_return(int i, atomic_t *v) -{ - int __i; -#ifdef CONFIG_M386 - unsigned long flags; - if (unlikely(boot_cpu_data.x86 <= 3)) - goto no_xadd; -#endif - /* Modern 486+ processor */ - __i = i; - asm volatile(LOCK_PREFIX "xaddl %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; - -#ifdef CONFIG_M386 -no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); - __i = atomic_read(v); - atomic_set(v, i + __i); - local_irq_restore(flags); - return i + __i; -#endif -} - -/** - * atomic_sub_return - subtract integer and return - * @v: pointer of type atomic_t - * @i: integer value to subtract - * - * Atomically subtracts @i from @v and returns @v - @i - */ -static inline int atomic_sub_return(int i, atomic_t *v) -{ - return atomic_add_return(-i, v); -} - -#define atomic_inc_return(v) (atomic_add_return(1, v)) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) - -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) -{ - return cmpxchg(&v->counter, old, new); -} - -static inline int atomic_xchg(atomic_t *v, int new) -{ - return xchg(&v->counter, new); -} - -/** - * atomic_add_unless - add unless the number is already a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as @v was not already @u. 
- * Returns non-zero if @v was not @u, and zero otherwise. - */ -static inline int atomic_add_unless(atomic_t *v, int a, int u) -{ - int c, old; - c = atomic_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); -} - -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) - -/** - * atomic_inc_short - increment of a short integer - * @v: pointer to type int - * - * Atomically adds 1 to @v - * Returns the new value of @u - */ -static inline short int atomic_inc_short(short int *v) -{ - asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); - return *v; -} - -#ifdef CONFIG_X86_64 -/** - * atomic_or_long - OR of two long integers - * @v1: pointer to type unsigned long - * @v2: pointer to type unsigned long - * - * Atomically ORs @v1 and @v2 - * Returns the result of the OR - */ -static inline void atomic_or_long(unsigned long *v1, unsigned long v2) -{ - asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); -} -#endif - -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ - : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" ((unsigned)(mask)), "m" (*(addr)) \ - : "memory") - -/* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - -#include -#include -#endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h deleted file mode 100644 index 77407887cfcd..000000000000 --- a/arch/x86/include/asm/atomic_64.h +++ /dev/null @@ -1,295 +0,0 @@ -#ifndef _ASM_X86_ATOMIC_64_H -#define _ASM_X86_ATOMIC_64_H - -#include -#include -#include -#include -#include - -/* - * Atomic operations that C can't guarantee us. Useful for - * resource counting etc.. - */ - -#define ATOMIC_INIT(i) { (i) } - -/** - * atomic_read - read atomic variable - * @v: pointer of type atomic_t - * - * Atomically reads the value of @v. - */ -static inline int atomic_read(const atomic_t *v) -{ - return v->counter; -} - -/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ -static inline void atomic_set(atomic_t *v, int i) -{ - v->counter = i; -} - -/** - * atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v. - */ -static inline void atomic_add(int i, atomic_t *v) -{ - asm volatile(LOCK_PREFIX "addl %1,%0" - : "+m" (v->counter) - : "ir" (i)); -} - -/** - * atomic_sub - subtract integer from atomic variable - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v. - */ -static inline void atomic_sub(int i, atomic_t *v) -{ - asm volatile(LOCK_PREFIX "subl %1,%0" - : "+m" (v->counter) - : "ir" (i)); -} - -/** - * atomic_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. 
- */ -static inline int atomic_sub_and_test(int i, atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -} - -/** - * atomic_inc - increment atomic variable - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1. - */ -static inline void atomic_inc(atomic_t *v) -{ - asm volatile(LOCK_PREFIX "incl %0" - : "+m" (v->counter)); -} - -/** - * atomic_dec - decrement atomic variable - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1. - */ -static inline void atomic_dec(atomic_t *v) -{ - asm volatile(LOCK_PREFIX "decl %0" - : "+m" (v->counter)); -} - -/** - * atomic_dec_and_test - decrement and test - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline int atomic_dec_and_test(atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "decl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; -} - -/** - * atomic_inc_and_test - increment and test - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline int atomic_inc_and_test(atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "incl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; -} - -/** - * atomic_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static inline int atomic_add_negative(int i, atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -} - -/** - * atomic_add_return - add integer and return - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns @i + @v - */ -static inline int atomic_add_return(int i, atomic_t *v) -{ - int __i; -#ifdef CONFIG_M386 - unsigned long flags; - if (unlikely(boot_cpu_data.x86 <= 3)) - goto no_xadd; -#endif - /* Modern 486+ processor */ - __i = i; - asm volatile(LOCK_PREFIX "xaddl %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; - -#ifdef CONFIG_M386 -no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); - __i = atomic_read(v); - atomic_set(v, i + __i); - local_irq_restore(flags); - return i + __i; -#endif -} - -/** - * atomic_sub_return - subtract integer and return - * @v: pointer of type atomic_t - * @i: integer value to subtract - * - * Atomically subtracts @i from @v and returns @v - @i - */ -static inline int atomic_sub_return(int i, atomic_t *v) -{ - return atomic_add_return(-i, v); -} - -#define atomic_inc_return(v) (atomic_add_return(1, v)) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) - -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) -{ - return cmpxchg(&v->counter, old, new); -} - -static inline int atomic_xchg(atomic_t *v, int new) -{ - return xchg(&v->counter, new); -} - -/** - * atomic_add_unless - add unless the number is already a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as @v was not already @u. 
- * Returns non-zero if @v was not @u, and zero otherwise. - */ -static inline int atomic_add_unless(atomic_t *v, int a, int u) -{ - int c, old; - c = atomic_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); -} - -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) - -/** - * atomic_inc_short - increment of a short integer - * @v: pointer to type int - * - * Atomically adds 1 to @v - * Returns the new value of @u - */ -static inline short int atomic_inc_short(short int *v) -{ - asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); - return *v; -} - -#ifdef CONFIG_X86_64 -/** - * atomic_or_long - OR of two long integers - * @v1: pointer to type unsigned long - * @v2: pointer to type unsigned long - * - * Atomically ORs @v1 and @v2 - * Returns the result of the OR - */ -static inline void atomic_or_long(unsigned long *v1, unsigned long v2) -{ - asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); -} -#endif - -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ - : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" ((unsigned)(mask)), "m" (*(addr)) \ - : "memory") - -/* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - -#include -#include -#endif /* _ASM_X86_ATOMIC_64_H */ -- cgit v1.2.2 From e1e0138d7d10fd447c71cc70f367eac514bd3ce4 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 7 Jan 2010 10:12:40 -0600 Subject: x86, uv: uv_global_gru_mmr_address() macro fix Fix bug in uv_global_gru_mmr_address macro. Macro failed to cast an int value to a long prior to a left shift > 32. Signed-off-by: Jack Steiner LKML-Reference: <20100107161240.GA2610@sgi.com> Cc: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/uv_hub.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 811bfabc80b7..7a81d9db57b9 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -335,7 +335,8 @@ static inline unsigned long uv_read_global_mmr64(int pnode, */ static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) { - return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val); + return UV_GLOBAL_GRU_MMR_BASE | offset | + ((unsigned long)pnode << uv_hub_info->m_val); } /* -- cgit v1.2.2 From 99659a929d653d0c9ce458091870544768add871 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 7 Jan 2010 15:35:42 +0100 Subject: x86, uv: Remove recursion in uv_heartbeat_enable() The recursion is not needed and does not improve readability. Signed-off-by: Roel Kluin LKML-Reference: <4B45F13E.3040202@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/x2apic_uv_x.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index af5d103bb533..d199dc34f54a 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -475,7 +475,7 @@ static void uv_heartbeat(unsigned long ignored) static void __cpuinit uv_heartbeat_enable(int cpu) { - if (!uv_cpu_hub_info(cpu)->scir.enabled) { + while (!uv_cpu_hub_info(cpu)->scir.enabled) { struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); @@ -483,11 +483,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu) timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_hub_info(cpu)->scir.enabled = 1; - } - /* check boot cpu */ - if (!uv_cpu_hub_info(0)->scir.enabled) - uv_heartbeat_enable(0); + /* also ensure that boot cpu is enabled */ + cpu = 0; + } } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.2 From 4b529401c5089cf33f7165607cbc2fde43357bfb Mon Sep 17 00:00:00 2001 From: Andreas Fenkart Date: Fri, 8 Jan 2010 14:42:31 -0800 Subject: mm: make totalhigh_pages unsigned long Makes it consistent with the extern declaration, used when CONFIG_HIGHMEM is set Removes redundant casts in printout messages Signed-off-by: Andreas Fenkart Acked-by: Russell King Cc: Ralf Baechle Cc: David Howells Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Chen Liqin Cc: Lennox Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c973f8e2a6cf..9a0c258a86be 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -892,8 +892,7 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) - ); + totalhigh_pages << (PAGE_SHIFT-10)); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" -- cgit v1.2.2 From 13510997d600a076e064f10587a8f6d20f8fff41 Mon Sep 17 00:00:00 2001 From: Albin Tonnerre Date: Fri, 8 Jan 2010 14:42:45 -0800 Subject: x86: add support for LZO-compressed kernels The necessary changes to the x86 Kconfig and boot/compressed to allow the use of this new compression method Signed-off-by: Albin Tonnerre Acked-by: H. 
Peter Anvin Tested-by: Wu Zhangjin Cc: Ingo Molnar Cc: Thomas Gleixner Tested-by: Russell King Acked-by: Russell King Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/Makefile | 5 ++++- arch/x86/boot/compressed/misc.c | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 55298e891571..6bf1f1ac478c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -49,6 +49,7 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS select ANON_INODES diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f25bbd37765a..fbb47daf2459 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,7 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -49,10 +49,13 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(call if_changed,bzip2) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) +$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lzo) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma +suffix-$(CONFIG_KERNEL_LZO) := lzo quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 842b2a36174a..3b22fe8ab91b 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -162,6 +162,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzma.c" #endif +#ifdef CONFIG_KERNEL_LZO +#include "../../../../lib/decompress_unlzo.c" +#endif + static void scroll(void) { int i; -- cgit v1.2.2 From a29815a333c6c6e677294bbe5958e771d0aad3fd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jan 2010 16:28:09 +0200 Subject: core, x86: make LIST_POISON less deadly The list macros use LIST_POISON1 and LIST_POISON2 as undereferenceable pointers in order to trap erroneous use of freed list_heads. Unfortunately userspace can arrange for those pointers to actually be dereferenceable, potentially turning an oops into an exploit. To avoid this, allow architectures (currently x86_64 only) to override the default values for these pointers with truly undereferenceable values. This is easy on x86_64 as the virtual address space is large and contains areas that cannot be mapped. Other 64-bit architectures will likely find similar unmapped ranges. A rough sketch of the resulting scheme follows below.
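Because this log is limited to arch/x86, only the Kconfig half of the change appears below. Here is a rough sketch of how the generic half (include/linux/poison.h and the list helpers) fits together; treat it as illustration rather than the patch itself:

/*
 * Rough sketch of the generic side of the scheme described above. With
 * CONFIG_ILLEGAL_POINTER_VALUE pointing into a range that can never be
 * mapped, the poison values become truly undereferenceable rather than
 * merely improbable user-space addresses.
 */
#ifdef CONFIG_ILLEGAL_POINTER_VALUE
# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
#else
# define POISON_POINTER_DELTA 0
#endif

#define LIST_POISON1  ((void *) 0x00100100 + POISON_POINTER_DELTA)
#define LIST_POISON2  ((void *) 0x00200200 + POISON_POINTER_DELTA)

/* list_del() plants the poison; any later dereference then faults: */
static inline void list_del(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	entry->next = LIST_POISON1;
	entry->prev = LIST_POISON2;
}

An architecture opts in by giving ILLEGAL_POINTER_VALUE a value inside a hole in its address space, which is exactly what the Kconfig hunk below does for x86_64.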
[ingo: switch to 0xdead000000000000 as the unmapped area] [ingo: add comments, cleanup] [jaswinder: eliminate sparse warnings] Acked-by: Linus Torvalds Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar Signed-off-by: Avi Kivity Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6bf1f1ac478c..cbcbfdee3ee0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1247,6 +1247,11 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ILLEGAL_POINTER_VALUE + hex + default 0 if X86_32 + default 0xdead000000000000 if X86_64 + source "mm/Kconfig" config HIGHPTE -- cgit v1.2.2 From 066000dd856709b6980123eb39b957fe26993f7b Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Mon, 11 Jan 2010 15:51:04 -0800 Subject: Revert "x86, apic: Use logical flat on intel with <= 8 logical cpus" Revert commit 2fbd07a5f5d1295fa9b0c0564ec27da7c276a75a, as this commit breaks an IBM platform with quad-core Xeon CPUs. According to Suresh, this might be an IBM platform issue, as on other Intel platforms with <= 8 logical CPUs, logical flat mode works fine irrespective of physical apic id values (in line with the xapic architecture). Revert this for now because of the IBM platform breakage. Another version will be re-submitted after the complete analysis. Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Suresh Siddha Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/apic.c | 26 ++++++++++++++++++-------- arch/x86/kernel/apic/probe_64.c | 15 ++++----------- 2 files changed, 22 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index aa57c079c98f..e80f291472a4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -62,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. * - * On AMD, this determines the messaging protocol we can use: if all APIC IDs + * This determines the messaging protocol we can use: if all APIC IDs * are in the 0 ... 7 range, then we can use logical addressing which * has some performance advantages (better broadcasting). * @@ -1898,14 +1898,24 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - def_to_bigsmp = 1; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion.
+ * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj + */ + if (max_physical_apicid >= 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: def_to_bigsmp = 1; + } } #endif diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c4cbd3080c1c..65edc180fc82 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,23 +64,16 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif if (apic == &apic_flat) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - apic = &apic_physflat; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - apic = &apic_physflat; - } + if (max_physical_apicid >= 8) + apic = &apic_physflat; + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); - if (is_vsmp_box()) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; -- cgit v1.2.2 From 59c33fa7791e9948ba467c2b83e307a0d087ab49 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Jan 2010 16:21:09 -0800 Subject: x86-32: clean up rwsem inline asm statements This makes gcc use the right register names and instruction operand sizes automatically for the rwsem inline asm statements. So instead of using "(%%eax)" to specify the memory address that is the semaphore, we use "(%1)" or similar. And instead of forcing the operation to always be 32-bit, we use "%z0", taking the size from the actual semaphore data structure itself. This doesn't actually matter on x86-32, but if we want to use the same inline asm for x86-64, we'll need to have the compiler generate the proper 64-bit names for the registers (%rax instead of %eax), and if we want to use a 64-bit counter too (in order to avoid the 15-bit limit on the write counter that limits concurrent users to 32767 threads), we'll need to be able to generate instructions with "q" accesses rather than "l". Since this header currently isn't enabled on x86-64, none of that matters, but we do want to use the xadd version of the semaphores rather than have to take spinlocks to do a rwsem. The mm->mmap_sem can be heavily contended when you have lots of threads all taking page faults, and the fallback rwsem code that uses a spinlock performs abysmally badly in that case. [ hpa: modified the patch to skip size suffixes entirely when they are redundant due to register operands. ] Signed-off-by: Linus Torvalds LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/rwsem.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ca7517d33776..413620024768 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -105,7 +105,7 @@ do { \ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" - LOCK_PREFIX " incl (%%eax)\n\t" + LOCK_PREFIX " inc%z0 (%1)\n\t" /* adds 0x00000001, returns the old value */ " jns 1f\n" " call call_rwsem_down_read_failed\n" @@ -123,12 +123,12 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) { __s32 result, tmp; asm volatile("# beginning __down_read_trylock\n\t" - " movl %0,%1\n\t" + " mov %0,%1\n\t" "1:\n\t" - " movl %1,%2\n\t" - " addl %3,%2\n\t" + " mov %1,%2\n\t" + " add %3,%2\n\t" " jle 2f\n\t" - LOCK_PREFIX " cmpxchgl %2,%0\n\t" + LOCK_PREFIX " cmpxchg %2,%0\n\t" " jnz 1b\n\t" "2:\n\t" "# ending __down_read_trylock\n\t" @@ -147,9 +147,9 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtract 0x0000ffff, returns the old value */ - " testl %%edx,%%edx\n\t" + " test %1,%1\n\t" /* was the count 0 before? */ " jz 1f\n" " call call_rwsem_down_write_failed\n" @@ -185,7 +185,7 @@ static inline void __up_read(struct rw_semaphore *sem) { __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" " call call_rwsem_wake\n" @@ -201,18 +201,18 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { + unsigned long tmp; asm volatile("# beginning __up_write\n\t" - " movl %2,%%edx\n\t" - LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */ " jz 1f\n" " call call_rwsem_wake\n" "1:\n\t" "# ending __up_write\n" - : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc", "edx"); + : "+m" (sem->count), "=d" (tmp) + : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS) + : "memory", "cc"); } /* @@ -221,7 +221,7 @@ static inline void __up_write(struct rw_semaphore *sem) static inline void __downgrade_write(struct rw_semaphore *sem) { asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX " addl %2,(%%eax)\n\t" + LOCK_PREFIX " add%z0 %2,(%1)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" @@ -237,7 +237,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) */ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) { - asm volatile(LOCK_PREFIX "addl %1,%0" + asm volatile(LOCK_PREFIX "add%z0 %1,%0" : "+m" (sem->count) : "ir" (delta)); } -- cgit v1.2.2 From 2ca49b2fcf5813571663c3c4c894b78148c43690 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 4 Jan 2010 09:47:35 -0500 Subject: x86: Macroise x86 cache descriptors Use a macro to define the cache sizes when cachesize > 1 MB. This is less typing, and less prone to introducing bugs like we saw in e02e0e1a130b9ca37c5186d38ad4b3aaf58bb149, and means we don't have to do maths when adding new non-power-of-2 updates like those seen recently. 
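
The table stores its sizes in KB, so the helper is just the following (a sketch mirroring the diff below):

	#define MB(x)	((x) * 1024)	/* cache_table sizes are in KB */

which turns, for example, the 18 MB 0xeb descriptor from the diff into the self-documenting

	{ 0xeb, LVL_3, MB(18) },	/* 24-way set assoc, 64 byte line size */

instead of the easily mistyped literal 18432.
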
Signed-off-by: Dave Jones LKML-Reference: <20100104144735.GA18390@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 84 ++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc6c8ef92dcc..c2b722d5a722 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -31,6 +31,8 @@ struct _cache_table { short size; }; +#define MB(x) ((x) * 1024) + /* All the cache descriptor types we care about (no TLB or trace cache entries) */ @@ -44,9 +46,9 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ @@ -59,16 +61,16 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ - { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ + { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ + { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ + { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ + { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ + { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ + { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ @@ -77,34 +79,34 @@ static const struct 
_cache_table __cpuinitconst cache_table[] = { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ - { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ - { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ + { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ + { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ + { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ + { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ + { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ + { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ + { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line 
size */ + { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ + { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ + { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; -- cgit v1.2.2 From c2c5d45d46c8c0fd34291dec958670ad4816796f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Dec 2009 03:52:25 +0100 Subject: perf: Stop stack frame walking off kernel addresses boundaries While processing kernel perf callchains, a bad entry can be considered as a valid stack pointer but not as a kernel address. In this case, we hang in an endless loop. This can happen in an x86-32 kernel after processing the last entry in a kernel stacktrace. Just stop the stack frame walking after we encounter an invalid kernel address. This fixes a hard lockup in x86-32. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1262227945-27014-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c56bc2873030..6d817554780a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -123,13 +123,15 @@ print_context_stack_bp(struct thread_info *tinfo, while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { unsigned long addr = *ret_addr; - if (__kernel_text_address(addr)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); - } + if (!__kernel_text_address(addr)) + break; + + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); } + return (unsigned long)frame; } EXPORT_SYMBOL_GPL(print_context_stack_bp); -- cgit v1.2.2 From 0fb8ee48d9dfff6a0913ceb0be2068d8be203763 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Dec 2009 05:53:03 +0100 Subject: perf: Drop useless check for ignored frame The check that ignores the debug and nmi stack frames is useless now that we have a frame pointer that makes us start at the right place. We no longer have to deal with these.
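
Both changes rest on the same invariant: a frame is trusted only while its return address lands in kernel text. The fixed walk, condensed from the dumpstack.c hunk above:

	while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
		unsigned long addr = *ret_addr;

		if (!__kernel_text_address(addr))
			break;			/* bogus entry: stop, don't spin */

		ops->address(data, addr, 1);	/* record this frame */
		frame = frame->next_frame;	/* follow the saved frame pointer */
		ret_addr = &frame->return_address;
	}

(The real loop also calls print_ftrace_graph_addr() per frame, as shown in the diff.)
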
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1262235183-5320-2-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 2 -- arch/x86/kernel/cpu/perf_event.c | 8 -------- arch/x86/kernel/dumpstack_32.c | 5 ----- arch/x86/kernel/dumpstack_64.c | 5 ----- 4 files changed, 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 35e89122a42f..4dab78edbad9 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -3,8 +3,6 @@ extern int kstack_depth_to_print; -int x86_is_stack_id(int id, char *name); - struct thread_info; struct stacktrace_ops; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d616c06e99b4..b1bb8c550526 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2297,7 +2297,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_ignored_frame); static void @@ -2313,10 +2312,6 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_ignored_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name) || - x86_is_stack_id(DEBUG_STACK, name); - return 0; } @@ -2324,9 +2319,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_ignored_frame, smp_processor_id())) - return; - if (reliable) callchain_store(entry, addr); } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index ae775ca47b25..11540a189d93 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -18,11 +18,6 @@ #include "dumpstack.h" -/* Just a stub for now */ -int x86_is_stack_id(int id, char *name) -{ - return 0; -} void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f5..676bc051252e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = { #endif }; -int x86_is_stack_id(int id, char *name) -{ - return x86_stack_ids[id - 1] == name; -} - static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) { -- cgit v1.2.2 From aa5add93e92019018e905146f8c3d3f8e3c08300 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jan 2010 17:46:56 -0500 Subject: x86/ptrace: Remove unused regs_get_argument_nth API Because of dropping function argument syntax from kprobe-tracer, we don't need this API anymore. 
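
For reference, a former consumer would have looked roughly like this hypothetical caller (the function itself appears in the removed hunk below):

	/* At function entry, e.g. from a kprobe handler: */
	unsigned long arg0 = regs_get_argument_nth(regs, 0);	/* in a register */
	unsigned long arg6 = regs_get_argument_nth(regs, 6);	/* read from the stack */

No in-tree callers remain, hence the removal.
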
Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE Cc: Frederic Weisbecker Cc: Roland McGrath Cc: Oleg Nesterov Cc: Mahesh Salgaonkar Cc: Benjamin Herrenschmidt Cc: Michael Neuling Cc: Steven Rostedt Cc: linuxppc-dev@ozlabs.org LKML-Reference: <20100105224656.19431.92588.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 4 ---- arch/x86/kernel/ptrace.c | 24 ------------------------ 2 files changed, 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 9d369f680321..20102808b191 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -274,10 +274,6 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, return 0; } -/* Get Nth argument at function call */ -extern unsigned long regs_get_argument_nth(struct pt_regs *regs, - unsigned int n); - /* * These are defined as per linux/ptrace.h, which see. */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..73554a3aae8c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -140,30 +140,6 @@ static const int arg_offs_table[] = { #endif }; -/** - * regs_get_argument_nth() - get Nth argument at function call - * @regs: pt_regs which contains registers at function entry. - * @n: argument number. - * - * regs_get_argument_nth() returns @n th argument of a function call. - * Since usually the kernel stack will be changed right after function entry, - * you must use this at function entry. If the @n th entry is NOT in the - * kernel stack or pt_regs, this returns 0. - */ -unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) -{ - if (n < ARRAY_SIZE(arg_offs_table)) - return *(unsigned long *)((char *)regs + arg_offs_table[n]); - else { - /* - * The typical case: arg n is on the stack. - * (Note: stack[0] = return address, so skip it) - */ - n -= ARRAY_SIZE(arg_offs_table); - return regs_get_kernel_stack_nth(regs, 1 + n); - } -} - /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. -- cgit v1.2.2 From df39a2e48f99e2d706e8fa4dc99fd148eb59449d Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 4 Jan 2010 16:17:21 +0000 Subject: x86: mce.h: Fix warning in header checks Someone isn't reading their build output: Move the definition out of the exported header. Signed-off-by: Alan Cox Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 858baa061cfc..6c3fdd631ed3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -108,10 +108,11 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) -extern struct atomic_notifier_head x86_mce_decoder_chain; #ifdef __KERNEL__ +extern struct atomic_notifier_head x86_mce_decoder_chain; + #include #include #include -- cgit v1.2.2 From fcfbb2b5facd65efa7284cc315225bfe3d1856c2 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 8 Jan 2010 12:13:54 -0800 Subject: x86: SGI UV: Fix mapping of MMIO registers This fixes the problem of the initialization code not correctly mapping the entire MMIO space on a UV system.
A side effect is that the map_high() interface needed to be changed to accommodate different address and size shifts. Signed-off-by: Mike Travis Reviewed-by: Mike Habeck Cc: Jack Steiner Cc: Linus Torvalds LKML-Reference: <4B479202.7080705@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 5f92494dab61..b8bb869a6618 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -374,13 +374,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, - int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, + int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; - paddr = base << shift; - bytes = (1UL << shift) * (max_pnode + 1); + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -396,7 +396,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) { - map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)gru.s.base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); @@ -410,7 +410,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); + map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -420,7 +420,8 @@ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); + map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, + max_pnode, map_uc); } static __init void map_low_mmrs(void) -- cgit v1.2.2 From 42590a75019a50012f25a962246498dead428433 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 4 Jan 2010 16:16:23 +0900 Subject: x86/agp: Fix agp_amd64_init and agp_amd64_cleanup This fixes the regression introduced by the commit f405d2c02395a74d3883bd03ded36457aa3697ad. The above commit fixes the following issue: http://marc.info/?l=linux-kernel&m=126192729110083&w=2 However, it doesn't work properly when you remove and insert the agp_amd64 module again. agp_amd64_init() and agp_amd64_cleanup() should be called only when gart_iommu is not called earlier (that is, the GART IOMMU is not enabled). We need to use 'gart_iommu_aperture' to see if GART IOMMU is enabled or not.
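
Since this listing is limited to arch/x86, the consumer side in drivers/char/agp/amd64-agp.c is not shown; the gating it needs looks roughly like this sketch (names and details assumed, not verbatim from the patch):

	static int __init agp_amd64_mod_init(void)
	{
		/* GART IOMMU enabled: the IOMMU code claimed the
		 * hardware earlier, skip the module's own init. */
		if (gart_iommu_aperture)
			return 0;
		return agp_amd64_init();
	}

	static void __exit agp_amd64_mod_cleanup(void)
	{
		if (!gart_iommu_aperture)
			agp_amd64_cleanup();
	}

This is why the flag is exported in the hunk below.
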
Signed-off-by: FUJITA Tomonori Cc: mitov@issp.bas.bg Cc: davej@redhat.com LKML-Reference: <20100104161603L.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3704997e8b25..f147a95fd84a 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -31,6 +31,7 @@ #include int gart_iommu_aperture; +EXPORT_SYMBOL_GPL(gart_iommu_aperture); int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; -- cgit v1.2.2 From 864a0922dd128392467611d9857e5138c6a91999 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 13 Jan 2010 10:16:07 +0000 Subject: x86: kernel_thread() -- initialize SS to a known state Before kernel_thread() was converted into "C" we had pt_regs::ss set to __KERNEL_DS (by the SAVE_ALL asm macro). Though I must admit I didn't find any *explicit* load of %ss from this structure, it is better to be on the safe side and set it to a known value. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ian Campbell Cc: Christian Kujau Cc: Jeremy Fitzhardinge Cc: Brian Gerst LKML-Reference: <1263377768-19600-1-git-send-email-ian.campbell@citrix.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c6ee241c8a98..02c3ee013ccd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -288,6 +288,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; regs.gs = __KERNEL_STACK_CANARY; +#else + regs.ss = __KERNEL_DS; #endif regs.orig_ax = -1; -- cgit v1.2.2 From e68266b7001a4e29af083716f0c36c0d6dbb1b39 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 13 Jan 2010 10:16:08 +0000 Subject: x86: xen: 64-bit kernel RPL should be 0 Under Xen, 64-bit guests actually run their kernel in ring 3; however, the hypervisor takes care of squashing the descriptor RPLs transparently (in order to allow them to continue to differentiate between user and kernel space CS using the RPL). Therefore the Xen paravirt backend should use RPL==0 instead of 1 (or 3). Using RPL==1 causes generic arch code to take incorrect code paths because it uses "testl $3, , je foo" type tests for a userspace CS and this considers 1==userspace. This issue was previously masked because get_kernel_rpl() was omitted when setting CS in kernel_thread(). This was fixed when kernel_thread() was unified with 32 bit in f443ff4201dd25cd4dec183f9919ecba90c8edc2.
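
The misbehaving pattern, written out in C for clarity (a schematic; the kernel does this in asm with "testl $3, ..." style tests):

	/* The low two bits of a CS selector are the RPL. Generic code
	 * treats any nonzero RPL as user space, so a kernel CS with
	 * RPL 1 (the old Xen 64-bit setting) is misclassified as
	 * user mode. */
	static inline int user_mode_sketch(struct pt_regs *regs)
	{
		return (regs->cs & 3) != 0;
	}

Reporting RPL 0, as the #ifdef below does, makes these checks come out right again.
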
Signed-off-by: Ian Campbell Cc: Christian Kujau Cc: Jeremy Fitzhardinge Cc: Cyrill Gorcunov Cc: Brian Gerst LKML-Reference: <1263377768-19600-2-git-send-email-ian.campbell@citrix.com> Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2b26dd5930c6..36daccb68642 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1151,9 +1151,13 @@ asmlinkage void __init xen_start_kernel(void) /* keep using Xen gdt for now; no urgent need to change it */ +#ifdef CONFIG_X86_32 pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; +#else + pv_info.kernel_rpl = 0; +#endif /* set the limit of our address space */ xen_reserve_top(); -- cgit v1.2.2 From 557a701c16553b0b691dbb64ef30361115a80f64 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Mon, 14 Dec 2009 11:44:15 +0100 Subject: [CPUFREQ] Fix use after free of struct powernow_k8_data Easy fix for a regression introduced in 2.6.31. On managed CPUs the cpufreq.c core will call driver->exit(cpu) on the managed cpus and powernow_k8 will free the core's data. Later driver->get(cpu) function might get called trying to read out the current freq of a managed cpu and the NULL pointer check does not work on the freed object -> better set it to NULL. ->get() is unsigned and must return 0 as invalid frequency. Reference: http://bugzilla.kernel.org/show_bug.cgi?id=14391 Signed-off-by: Thomas Renninger Tested-by: Michal Schmidt CC: stable@kernel.org Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a9df9441a9a2..2da4fa3bf6e9 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) kfree(data->powernow_table); kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; return 0; } @@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu) int err; if (!data) - return -EINVAL; + return 0; smp_call_function_single(cpu, query_values_on_cpu, &err, true); if (err) -- cgit v1.2.2 From 0f1d683fb35d6c6f49ef696c95757f3970682a0e Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Thu, 17 Dec 2009 20:18:27 +0000 Subject: [CPUFREQ] Processor Clocking Control interface driver Processor Clocking Control (PCC) is an interface between the BIOS and OSPM. Based on the server workload, OSPM can request what frequency it expects from a logical CPU, and the BIOS will achieve that frequency transparently. This patch introduces driver support for PCC. OSPM uses the PCC driver to communicate with the BIOS via the PCC interface. There is a Documentation file that provides a link to the PCC Specification, and also provides a summary of the PCC interface. Currently, certain HP ProLiant platforms implement the PCC interface. However, any platform whose BIOS implements the PCC Specification, can utilize this driver. V2 --> V1 changes (based on Dominik's suggestions): - Removed the dependency on CPU_FREQ_TABLE - "cpufreq_stats" will no longer PANIC. Actually, it will not load anymore because it is not applicable. - Removed the sanity check for target frequency in the ->target routine. 
NOTE: A patch to sanitize the target frequency requested by "ondemand" is needed to ensure that the target freq < policy->min. Can this driver be queued up for the 2.6.33 tree? Signed-off-by: Naga Chumbalkar Signed-off-by: Matthew Garrett Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/Kconfig | 14 + arch/x86/kernel/cpu/cpufreq/Makefile | 1 + arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 621 ++++++++++++++++++++++++++++++ 3 files changed, 636 insertions(+) create mode 100644 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index f138c6c389b9..870e6cc6ad28 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -10,6 +10,20 @@ if CPU_FREQ comment "CPUFreq processor drivers" +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + . + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 509296df294d..1840c0a5170b 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c new file mode 100644 index 000000000000..29368854533c --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -0,0 +1,621 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define PCC_VERSION "1.00.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ + "pcc-cpufreq", msg) + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + dprintk("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%x, contains a value of: 0x%x. 
Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + dprintk("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + dprintk("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%x\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + 
input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + dprintk("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + dprintk("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == NULL) { + dprintk("probe: could not map shared mem region\n"); + goto 
out_free; + } + pcch_hdr = pcch_virt_addr; + + dprintk("probe: PCCH header (virtual) addr: 0x%llx\n", + (u64)pcch_hdr); + dprintk("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + dprintk("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + dprintk("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto pcch_null; + } + + result = pcc_get_offset(cpu); + if (result) { + dprintk("init: PCCP evaluation failed\n"); + goto free; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + dprintk("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); + + return 0; +free: + pcc_clear_mapping(); + free_percpu(pcc_cpu_info); +pcch_null: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + 
.init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); -- cgit v1.2.2 From fb4635932a4e19c2f55383f968a0e9b64da37354 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 7 Jan 2010 15:26:22 -0500 Subject: [CPUFREQ] Fix cast warning in pcc driver. arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c:458: warning: cast from pointer to integer of different size Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 29368854533c..ff36d2979a90 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -455,8 +455,7 @@ static int __init pcc_cpufreq_probe(void) } pcch_hdr = pcch_virt_addr; - dprintk("probe: PCCH header (virtual) addr: 0x%llx\n", - (u64)pcch_hdr); + dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); dprintk("probe: PCCH header is at physical address: 0x%llx," " signature: 0x%x, length: %d bytes, major: %d, minor: %d," " supported features: 0x%x, command field: 0x%x," -- cgit v1.2.2 From 3bef444797f7624f8fbd27f4e0334ce96a108725 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 13 Jan 2010 10:45:55 -0500 Subject: x86: Merge show_regs() Using kernel_stack_pointer() allows 32-bit and 64-bit versions to be merged. This is more correct for 64-bit, since the old %rsp is always saved on the stack. Signed-off-by: Brian Gerst LKML-Reference: <1263397555-27695-1-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/process.c | 7 +++++++ arch/x86/kernel/process_32.c | 6 ------ arch/x86/kernel/process_64.c | 6 ------ 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 98c2cdeb599e..cf1e04b2ad65 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -92,6 +92,13 @@ void exit_thread(void) } } +void show_regs(struct pt_regs *regs) +{ + show_registers(regs); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), + regs->bp); +} + void show_regs_common(void) { const char *board, *product; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9c517b5858f0..fe6a34e42bde 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -174,12 +174,6 @@ void __show_regs(struct pt_regs *regs, int all) d6, d7); } -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, &regs->sp, regs->bp); -} - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 52fbd0c60198..418f860880a2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -211,12 +211,6 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, (void *)(regs + 1), regs->bp); -} - void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { -- cgit v1.2.2 From 5d0b7235d83eefdafda300656e97d368afcafc9a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Jan 2010 17:57:35 -0800 Subject: x86: clean up rwsem type system The fast version of the rwsems (the code that uses xadd) has traditionally only worked on x86-32, and as a result it mixes different kinds of types wildly - they just all happen to be 32-bit. We have "long", we have "__s32", and we have "int". To make it work on x86-64, the types suddenly matter a lot more. It can be either a 32-bit or 64-bit signed type, and both work (with the caveat that a 32-bit counter will only have 15 bits of effective write counters, so it's limited to 32767 users). But whatever type you choose, it needs to be used consistently. This makes a new 'rwsem_count_t', which is a 32-bit signed type. For a 64-bit type, you'd need to also update the BIAS values. Signed-off-by: Linus Torvalds LKML-Reference: Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/rwsem.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 413620024768..5f9af3081d66 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -55,6 +55,9 @@ extern asmregparm struct rw_semaphore * /* * the semaphore definition + * + * The bias values and the counter type needs to be extended to 64 bits + * if we want to have more than 32767 potential readers/writers */ #define RWSEM_UNLOCKED_VALUE 0x00000000 @@ -64,8 +67,10 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +typedef signed int rwsem_count_t; + struct rw_semaphore { - signed long count; + rwsem_count_t count; spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -121,7 +126,7 @@ static inline void __down_read(struct rw_semaphore *sem) */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - __s32 result, tmp; + rwsem_count_t result, tmp; asm volatile("# beginning __down_read_trylock\n\t" " mov %0,%1\n\t" "1:\n\t" @@ -143,7 +148,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - int tmp; + rwsem_count_t tmp; tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" @@ -170,9 +175,9 @@ static inline void __down_write(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - signed long ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + rwsem_count_t ret = cmpxchg(&sem->count, + RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -183,7 +188,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ @@ -201,7 +206,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - unsigned long tmp; + rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* tries to transition @@ -245,9 +250,9 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline rwsem_count_t rwsem_atomic_update(int delta, struct rw_semaphore *sem) { - int tmp = delta; + rwsem_count_t tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) -- cgit v1.2.2 From bafaecd11df15ad5b1e598adc7736afcd38ee13d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Jan 2010 18:16:42 -0800 Subject: x86-64: support native xadd rwsem implementation This one is much faster than the spinlock based fallback rwsem code, with certain artificial benchmarks having shown 300%+ improvement on threaded page faults etc. Again, note the 32767-thread limit here. So this really does need that whole "make rwsem_count_t be 64-bit and fix the BIAS values to match" extension on top of it, but that is conceptually a totally independent issue. NOT TESTED!
The original patch that this all was based on was tested by KAMEZAWA Hiroyuki, but maybe I screwed up something when I created the cleaned-up series, so caveat emptor.. Also note that it _may_ be a good idea to mark some more registers clobbered on x86-64 in the inline asms instead of saving/restoring them. They are inline functions, but they are only used in places where there are not a lot of live registers _anyway_, so doing for example the clobbers of %r8-%r11 in the asm wouldn't make the fast-path code any worse, and would make the slow-path code smaller. (Not that the slow-path really matters to that degree. Saving a few unnecessary registers is the _least_ of our problems when we hit the slow path. The instruction/cycle counting really only matters in the fast path). Signed-off-by: Linus Torvalds LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig.cpu | 2 +- arch/x86/lib/Makefile | 1 + arch/x86/lib/rwsem_64.S | 81 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 arch/x86/lib/rwsem_64.S (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 08e442bc3ab9..9d38a13b4ceb 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT config X86_XADD def_bool y - depends on X86_32 && !M386 + depends on X86_64 || !M386 config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index cffd754f3039..c80245131fdc 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -39,4 +39,5 @@ else lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o + lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o endif diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S new file mode 100644 index 000000000000..15acecf0d7aa --- /dev/null +++ b/arch/x86/lib/rwsem_64.S @@ -0,0 +1,81 @@ +/* + * x86-64 rwsem wrappers + * + * This interfaces the inline asm code to the slow-path + * C routines. We need to save the call-clobbered regs + * that the asm does not mark as clobbered, and move the + * argument from %rax to %rdi. + * + * NOTE! We don't need to save %rax, because the functions + * will always return the semaphore pointer in %rax (which + * is also the input argument to these helpers) + * + * The following can clobber %rdx because the asm clobbers it: + * call_rwsem_down_write_failed + * call_rwsem_wake + * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
+ */ + +#include +#include +#include +#include +#include + +#define save_common_regs \ + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 + +#define restore_common_regs \ + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_down_read_failed + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed + restore_common_regs + ret + ENDPROC(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + decw %dx /* do nothing if still outstanding active readers */ + jnz 1f + save_common_regs + movq %rax,%rdi + call rwsem_wake + restore_common_regs +1: ret + ENDPROC(call_rwsem_wake) + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_downgrade_wake) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_downgrade_wake + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_downgrade_wake) -- cgit v1.2.2 From 3a4d5c94e959359ece6d6b55045c3f046677f55c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 14 Jan 2010 06:17:27 +0000 Subject: vhost_net: a kernel-level virtio server What it is: vhost net is a character device that can be used to reduce the number of system calls involved in virtio networking. Existing virtio net code is used in the guest without modification. There's similarity with vringfd, with some differences and reduced scope - uses eventfd for signalling - structures can be moved around in memory at any time (good for migration, bug work-arounds in userspace) - write logging is supported (good for migration) - support memory table and not just an offset (needed for kvm) Common virtio-related code has been put in a separate file, vhost.c, and can be made into a separate module if/when more backends appear. I used Rusty's lguest.c as the source for developing this part: this supplied me with witty comments I wouldn't be able to write myself. What it is not: vhost net is not a bus, and not a generic new system call. No assumptions are made on how the guest performs hypercalls. Userspace hypervisors are supported as well as kvm. How it works: Basically, we connect virtio frontend (configured by userspace) to a backend. The backend could be a network device, or a tap device. Backend is also configured by userspace, including vlan/mac etc. Status: This works for me, and I haven't seen any crashes. Compared to userspace, people reported improved latency (as I save up to 4 system calls per packet), as well as better bandwidth and CPU utilization. Features that I plan to look at in the future: - mergeable buffers - zero copy - scalability tuning: figure out the best threading model to use Note on RCU usage (this is also documented in vhost.h, near private_pointer which is the value protected by this variant of RCU): what is happening is that the rcu_dereference() is being used in a workqueue item. The role of rcu_read_lock() is taken on by the start of execution of the workqueue item, of rcu_read_unlock() by the end of execution of the workqueue item, and of synchronize_rcu() by flush_workqueue()/flush_work(). In the future we might need to apply some gcc attribute or sparse annotation to the function passed to INIT_WORK(). Paul's ack below is for this RCU usage.
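To illustrate that pattern concretely, here is a minimal sketch of workqueue-as-RCU. It is illustrative only -- my_dev, my_work_fn and my_replace are made-up names, not vhost's actual code -- but it shows how work-item execution stands in for the read-side critical section and flush_work() for the grace period:

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_dev {
	void *private;			/* written only via rcu_assign_pointer() */
	struct work_struct work;
};

static void my_work_fn(struct work_struct *work)
{
	struct my_dev *d = container_of(work, struct my_dev, work);
	void *p;

	/* start of the work item plays the role of rcu_read_lock() */
	p = rcu_dereference(d->private);
	if (p)
		pr_info("work item using %p\n", p);
	/* return from the work item plays the role of rcu_read_unlock() */
}

static void my_replace(struct my_dev *d, void *new)
{
	void *old = d->private;

	rcu_assign_pointer(d->private, new);	/* publish the new pointer */
	flush_work(&d->work);			/* plays the role of synchronize_rcu() */
	kfree(old);				/* no work item can still be using 'old' */
}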
(Includes fixes by Alan Cox , David L Stevens , Chris Wright ) Acked-by: Rusty Russell Acked-by: Arnd Bergmann Acked-by: "Paul E. McKenney" Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4cd498332466..3c4d0109ad20 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -65,6 +65,7 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/lguest/Kconfig source drivers/virtio/Kconfig -- cgit v1.2.2 From 7a1110e861b2666ac09f5708d6fbe71d18ce64bb Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 12 Jan 2010 15:09:04 -0600 Subject: x86, uv: Add function retrieving node controller revision number Add function for determining the revision id of the SGI UV node controller chip (HUB). This function is needed in a subsequent patch. Signed-off-by: Jack Steiner LKML-Reference: <20100112210904.GA24546@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/uv_hub.h | 12 ++++++++++++ arch/x86/kernel/apic/x2apic_uv_x.c | 6 ++++++ 2 files changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index bc54fa965af3..40be813fefb1 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -495,5 +495,17 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } +/* + * Get the minimum revision number of the hub chips within the partition. + * 1 - initial rev 1.0 silicon + * 2 - rev 2.0 production silicon + */ +static inline int uv_get_min_hub_revision_id(void) +{ + extern int uv_min_hub_revision_id; + + return uv_min_hub_revision_id; +} + #endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_UV_UV_HUB_H */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b8bb869a6618..0e48de9ff864 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -36,6 +36,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +int uv_min_hub_revision_id; +EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); static inline bool is_GRU_range(u64 start, u64 end) { @@ -55,6 +57,10 @@ static int early_get_nodeid(void) mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); node_id.v = *mmr; early_iounmap(mmr, sizeof(*mmr)); + + /* Currently, all blades have same revision number */ + uv_min_hub_revision_id = node_id.s.revision; + return node_id.s.node_id; } -- cgit v1.2.2 From 1d2c867c941d635e53e8ad7bf37d060bb5b25ec5 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 15 Jan 2010 12:09:09 -0600 Subject: x86, uv: Ensure hub revision set for all ACPI modes. Ensure that UV hub revision is set for all ACPI modes. Signed-off-by: Russ Anderson LKML-Reference: <20100115180908.GB7757@sgi.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/x2apic_uv_x.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0e48de9ff864..21db3cbea7dc 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -66,7 +66,10 @@ static int early_get_nodeid(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { + int nodeid; + if (!strcmp(oem_id, "SGI")) { + nodeid = early_get_nodeid(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; @@ -74,7 +77,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); + nodeid << (UV_APIC_PNODE_SHIFT - 1); uv_system_type = UV_NON_UNIQUE_APIC; return 1; } -- cgit v1.2.2 From 0bb7a95f5455cd87e6a69e5818bc1f509a98d187 Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Sat, 16 Jan 2010 10:39:30 +0100 Subject: hw-breakpoints, perf: Fix broken mmiotrace due to dr6 by reference change Commit 62edab9056a6cf0c9207339c8892c923a5217e45 (from June 2009 but merged in 2.6.33) changes notify_die to pass dr6 by reference. However, it forgets to fix the check for DR_STEP in kmmio.c, breaking mmiotrace. It also passes a wrong value to the post handler. This simple fix makes mmiotrace work again. Signed-off-by: Luca Barbieri Acked-by: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <1263634770-14578-1-git-send-email-luca@luca-barbieri.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index c0f6198565eb..536fb6823366 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -538,14 +538,15 @@ static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) { struct die_args *arg = args; + unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err); - if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) { + if (val == DIE_DEBUG && (*dr6_p & DR_STEP)) + if (post_kmmio_handler(*dr6_p, arg->regs) == 1) { /* * Reset the BS bit in dr6 (pointed by args->err) to * denote completion of processing */ - (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; + *dr6_p &= ~DR_STEP; return NOTIFY_STOP; } -- cgit v1.2.2 From 00097c4fdf117d9845d772f571a987ae95523f8c Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Sun, 17 Jan 2010 19:44:44 -0200 Subject: x86, trivial: Fix grammo in tsc comment about Geode TSC reliability Signed-off-by: Thadeu Lima de Souza Cascardo Cc: marcelo@kvack.org Cc: dilinger@collabora.co.uk Cc: trivial@kernel.org LKML-Reference: <1263764685-9871-1-git-send-email-cascardo@holoscopio.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba0..23066ecf12fa 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void) unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ + /* Geode_LX - the OLPC CPU has a very 
reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; #endif -- cgit v1.2.2 From 722b3654852e48b93367a63f8ada9ee1cd43f2d3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 13 Jan 2010 16:19:10 -0800 Subject: x86, vmi: Fix vmi_get_timer_vector() to use IRQ0_VECTOR FIRST_DEVICE_VECTOR is going away, and it looks like a bad hack to steal FIRST_DEVICE_VECTOR / FIRST_EXTERNAL_VECTOR when what is really needed is IRQ0_VECTOR. Fix vmi_get_timer_vector() to use IRQ0_VECTOR. Signed-off-by: Suresh Siddha LKML-Reference: <20100114002118.436172066@sbs-t61.sc.intel.com> Cc: Alok N Kataria Cc: Zach Amsden Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmiclock_32.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 74c92bb194df..1268d993e9ca 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void) static inline unsigned int vmi_get_timer_vector(void) { -#ifdef CONFIG_X86_IO_APIC - return FIRST_DEVICE_VECTOR; -#else - return FIRST_EXTERNAL_VECTOR; -#endif + return IRQ0_VECTOR; } /** vmi clockchip */ @@ -239,8 +235,6 @@ void __init vmi_time_init(void) vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.2 From 6579b474572fd54c583ac074e8e7aaae926c62ef Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 13 Jan 2010 16:19:11 -0800 Subject: x86, irq: Use 0x20 for the IRQ_MOVE_CLEANUP_VECTOR instead of 0x1f After talking to some more folks inside Intel (Peter Anvin, Asit Mallick), the safest option (for future compatibility etc.) was to use vector 0x20 for IRQ_MOVE_CLEANUP_VECTOR instead of vector 0x1f (which is documented as a reserved vector in the Intel IA32 manuals). Also we don't need to reserve the entire privilege level (all 16 vectors in the priority bucket that IRQ_MOVE_CLEANUP_VECTOR falls into), as the x86 architecture (section 10.9.3 in SDM Vol3a) specifies that within the priority level, the higher the vector number the higher the priority. And hence we don't need to reserve the complete priority level 0x20-0x2f for the IRQ migration cleanup logic. So change the IRQ_MOVE_CLEANUP_VECTOR to 0x20 and allow 0x21-0x2f to be used for device interrupts. 0x30-0x3f will be used for ISA interrupts (these also can be migrated in the context of IOAPIC and hence need to be at a higher priority level than IRQ_MOVE_CLEANUP_VECTOR). Signed-off-by: Suresh Siddha LKML-Reference: <20100114002118.521826763@sbs-t61.sc.intel.com> Cc: Yinghai Lu Cc: Eric W. Biederman Cc: Maciej W. Rozycki Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_vectors.h | 47 +++++++++++++------------------------- arch/x86/kernel/apic/io_apic.c | 4 ++-- 2 files changed, 18 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 585a42810cf8..8767d99c4f64 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -28,19 +28,22 @@ #define MCE_VECTOR 0x12 /* - * IDT vectors usable for external interrupt sources start - * at 0x20: - * hpa said we can start from 0x1f. - * 0x1f is documented as reserved.
However, the ability for the APIC - * to generate vectors starting at 0x10 is documented, as is the - * ability for the CPU to receive any vector number as an interrupt. - * 0x1f is used for IRQ_MOVE_CLEANUP_VECTOR since that vector needs - * an entire privilege level (16 vectors) all by itself at a higher - * priority than any actual device vector. Thus, by placing it in the - * otherwise-unusable 0x10 privilege level, we avoid wasting a full - * 16-vector block. + * IDT vectors usable for external interrupt sources start at 0x20. + * (0x80 is the syscall vector, 0x30-0x3f are for ISA) */ -#define FIRST_EXTERNAL_VECTOR 0x1f +#define FIRST_EXTERNAL_VECTOR 0x20 +/* + * We start allocating at 0x21 to spread out vectors evenly between + * priority levels. (0x80 is the syscall vector) + */ +#define VECTOR_OFFSET_START 1 + +/* + * Reserve the lowest usable vector (and hence lowest priority) 0x20 for + * triggering cleanup after irq migration. 0x21-0x2f will still be used + * for device interrupts. + */ +#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR #define IA32_SYSCALL_VECTOR 0x80 #ifdef CONFIG_X86_32 @@ -48,17 +51,7 @@ #endif /* - * Reserve the lowest usable priority level 0x10 - 0x1f for triggering - * cleanup after irq migration. - * this overlaps with the reserved range for cpu exceptions so this - * will need to be changed to 0x20 - 0x2f if the last cpu exception is - * ever allocated. - */ - -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. + * Vectors 0x30-0x3f are used for ISA interrupts. * round up to the next 16-vector boundary */ #define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) @@ -132,14 +125,6 @@ */ #define MCE_SELF_VECTOR 0xeb -/* - * First APIC vector available to drivers: (vectors 0x30-0xee). We - * start allocating at 0x31 to spread out vectors evenly between - * priority levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 1) -#define VECTOR_OFFSET_START 1 - #define NR_VECTORS 256 #define FPU_IRQ 13 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e9ba0903e9d5..409f4943dc1a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1162,7 +1162,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR + VECTOR_OFFSET_START; + static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 8; unsigned int old_vector; int cpu, err; @@ -1199,7 +1199,7 @@ next: if (vector >= first_system_vector) { /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; + vector = FIRST_EXTERNAL_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; -- cgit v1.2.2 From 1838ef1d782f7527e6defe87e180598622d2d071 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Jan 2010 14:00:34 -0800 Subject: x86-64, rwsem: 64-bit xadd rwsem implementation For x86-64, 32767 threads really is not enough. Change rwsem_count_t to a signed long, so that it is 64 bits on x86-64. This required the following changes to the assembly code: a) %z0 doesn't work on all versions of gcc! At least gcc 4.4.2 as shipped with Fedora 12 emits "ll" not "q" for 64 bits, even for integer operands. 
Newer gccs apparently do this correctly, but avoid this problem by using the _ASM_ macros instead of %z. b) 64-bit immediates are only allowed in "movq $imm,%reg" constructs... no others. Change some of the constraints to "e", and fix the one case where we would have had to use an invalid immediate -- in that case, we only care about the upper half anyway, so just access the upper half. Signed-off-by: H. Peter Anvin Cc: Linus Torvalds LKML-Reference: --- arch/x86/include/asm/rwsem.h | 53 +++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 5f9af3081d66..10204a25bf93 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -41,6 +41,7 @@ #include #include #include +#include struct rwsem_waiter; @@ -56,18 +57,24 @@ extern asmregparm struct rw_semaphore * /* * the semaphore definition * - * The bias values and the counter type needs to be extended to 64 bits - * if we want to have more than 32767 potential readers/writers + * The bias values and the counter type limits the number of + * potential readers/writers to 32767 for 32 bits and 2147483647 + * for 64 bits. */ -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) +#ifdef CONFIG_X86_64 +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_UNLOCKED_VALUE 0x00000000L +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -typedef signed int rwsem_count_t; +typedef signed long rwsem_count_t; struct rw_semaphore { rwsem_count_t count; @@ -110,7 +117,7 @@ do { \ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" - LOCK_PREFIX " inc%z0 (%1)\n\t" + LOCK_PREFIX _ASM_INC "(%1)\n\t" /* adds 0x00000001, returns the old value */ " jns 1f\n" " call call_rwsem_down_read_failed\n" @@ -225,8 +232,25 @@ static inline void __up_write(struct rw_semaphore *sem) */ static inline void __downgrade_write(struct rw_semaphore *sem) { +#ifdef CONFIG_X86_64 +# if RWSEM_WAITING_BIAS != -0x100000000 +# error "This code assumes RWSEM_WAITING_BIAS == -2^32" +# endif + + /* 64-bit immediates are special and expensive, and not needed here */ + asm volatile("# beginning __downgrade_write\n\t" + LOCK_PREFIX "incl 4(%1)\n\t" + /* transitions 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 */ + " jns 1f\n\t" + " call call_rwsem_downgrade_wake\n" + "1:\n\t" + "# ending __downgrade_write\n" + : "+m" (sem->count) + : "a" (sem) + : "memory", "cc"); +#else asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX " add%z0 %2,(%1)\n\t" + LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" @@ -235,22 +259,25 @@ static inline void __downgrade_write(struct rw_semaphore *sem) : "+m" (sem->count) : "a" (sem), "i" (-RWSEM_WAITING_BIAS) : "memory", "cc"); +#endif } /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(rwsem_count_t delta, + struct rw_semaphore *sem) { - asm volatile(LOCK_PREFIX "add%z0 %1,%0" + asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) : "ir"
(delta)); + : "er" (delta)); } /* * implement exchange and add functionality */ -static inline rwsem_count_t rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, + struct rw_semaphore *sem) { rwsem_count_t tmp = delta; -- cgit v1.2.2 From dfea91d5a7c795fd6f4e1a97489a98e4e767463e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 18 Jan 2010 12:10:48 -0800 Subject: x86, apic: use physical mode for IBM summit platforms Chris McDermott from IBM confirmed that the Hurricane chipset in IBM summit platforms doesn't support logical flat mode. Irrespective of the other things like apic_id's and the total number of logical cpu's, the Linux kernel should default to physical mode for this system. The 32-bit kernel does so using the OEM checks for the IBM summit platform. Add a similar OEM platform check for the 64-bit kernel too. Otherwise the Linux kernel boot can hang on this platform under certain bios/platform settings. Signed-off-by: Suresh Siddha Tested-by: Ananth N Mavinakayanahalli Cc: Chris McDermott Cc: Yinghai Lu Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/apic_flat_64.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index eacbd2b31d27..e3c3d820c325 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } #endif return 0; -- cgit v1.2.2 From bb668da6d6f2bec8a63838c098d9515eccb22cc4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 18 Jan 2010 12:10:49 -0800 Subject: x86, apic: use logical flat for systems with <= 8 logical cpus We can use logical flat mode if there are <= 8 logical cpu's (irrespective of physical apic id values). This will enable simplified and efficient IPI and device interrupt routing on such platforms. This has been tested to work on both Intel and AMD platforms. Exceptions like the IBM summit platform, which can't use logical flat mode, are addressed by using OEM platform checks. Signed-off-by: Suresh Siddha Signed-off-by: Yinghai Lu Cc: Ananth N Mavinakayanahalli Cc: Chris McDermott Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/apic.c | 15 +-------------- arch/x86/kernel/apic/probe_64.c | 8 +++----- 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e80f291472a4..3987e4408f75 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. - * - * This determines the messaging protocol we can use: if all APIC IDs - * are in the 0 ... 7 range, then we can use logical addressing which - * has some performance advantages (better broadcasting). - * - * If there's an APIC ID above 8, we use physical addressing.
*/ unsigned int max_physical_apicid; @@ -1898,14 +1892,7 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj - */ - if (max_physical_apicid >= 8) { + if (num_processors > 8) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: if (!APIC_XAPIC(version)) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 65edc180fc82..450fe2064a14 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,15 +64,13 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif - if (apic == &apic_flat) { - if (max_physical_apicid >= 8) + if (apic == &apic_flat && num_processors > 8) apic = &apic_physflat; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); - } + + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); if (is_vsmp_box()) { /* need to update phys_pkg_id */ -- cgit v1.2.2 From 97943390b043bcafca69f9163b86bbf627b75589 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 19 Jan 2010 12:20:54 -0800 Subject: x86, irq: Don't block IRQ0_VECTOR..IRQ15_VECTOR's on all cpu's Currently IRQ0..IRQ15 are assigned to IRQ0_VECTOR..IRQ15_VECTOR's on all the cpu's. If these IRQ's are handled by a legacy PIC controller, then the kernel handles them only on cpu 0. So there is no need to block this vector space on all cpu's. Similarly if these IRQ's are handled by IO-APIC, then the IRQ affinity will determine on which cpu's we need to allocate the vector resource for that particular IRQ. This can be done dynamically and here also there is no need to block 16 vectors for IRQ0..IRQ15 on all cpu's. Fix this by initially assigning IRQ0..IRQ15 to IRQ0_VECTOR..IRQ15_VECTOR's only on cpu 0. If legacy controllers like the PIC handle these IRQ's, then this configuration will be fixed. If more modern controllers like IO-APIC handle these IRQ's, then we start with this configuration and as IRQ's migrate, vectors (/and cpu's) associated with these IRQ's change dynamically. This will free up the block of 16 vectors on other cpu's which don't handle IRQ0..IRQ15, which can now be used for other IRQ's that the particular cpu handles. [ hpa: this is also an architectural cleanup for future legacy-PIC-free configurations. ] [ hpa: fixed typo NR_LEGACY_IRQS -> NR_IRQS_LEGACY ] Signed-off-by: Suresh Siddha LKML-Reference: <1263932453.2814.52.camel@sbs-t61.sc.intel.com> Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/irq.h | 1 + arch/x86/kernel/apic/io_apic.c | 33 ++++++++++----------------------- arch/x86/kernel/irqinit.c | 35 +++++++++++++++++------------------ arch/x86/kernel/vmiclock_32.c | 2 ++ 4 files changed, 30 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 5458380b6ef8..262292729fc4 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -48,5 +48,6 @@ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); +extern int nr_legacy_irqs; #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 409f4943dc1a..1a30587a6bc2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -94,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* Number of legacy interrupts */ -static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; @@ -140,27 +138,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg irq_cfgx[] = { +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; #else -static struct irq_cfg irq_cfgx[NR_IRQS] = { +static struct irq_cfg irq_cfgx[NR_IRQS]; #endif - [0] = { .vector = IRQ0_VECTOR, }, - [1] = { .vector = IRQ1_VECTOR, }, - [2] = { .vector = IRQ2_VECTOR, }, - [3] = { .vector = IRQ3_VECTOR, }, - [4] = { .vector = IRQ4_VECTOR, }, - [5] = { .vector = IRQ5_VECTOR, }, - [6] = { .vector = IRQ6_VECTOR, }, - [7] = { .vector = IRQ7_VECTOR, }, - [8] = { .vector = IRQ8_VECTOR, }, - [9] = { .vector = IRQ9_VECTOR, }, - [10] = { .vector = IRQ10_VECTOR, }, - [11] = { .vector = IRQ11_VECTOR, }, - [12] = { .vector = IRQ12_VECTOR, }, - [13] = { .vector = IRQ13_VECTOR, }, - [14] = { .vector = IRQ14_VECTOR, }, - [15] = { .vector = IRQ15_VECTOR, }, -}; void __init io_apic_disable_legacy(void) { @@ -185,8 +166,14 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < nr_legacy_irqs) - cpumask_setall(cfg[i].domain); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. + */ + if (i < nr_legacy_irqs) { + cfg[i].vector = IRQ0_VECTOR + i; + cpumask_set_cpu(0, cfg[i].domain); + } } return 0; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index d5932226614f..fce55d532631 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -84,24 +84,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 + [0 ... 
NR_VECTORS - 1] = -1, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -116,6 +99,9 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } +/* Number of legacy interrupts */ +int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; + void __init init_ISA_irqs(void) { int i; @@ -142,6 +128,19 @@ void __init init_ISA_irqs(void) void __init init_IRQ(void) { + int i; + + /* + * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * If these IRQ's are handled by legacy interrupt-controllers like PIC, + * then this configuration will likely be static after the boot. If + * these IRQ's are handled by more mordern controllers like IO-APIC, + * then this vector space can be freed and re-used dynamically as the + * irq's migrate etc. + */ + for (i = 0; i < nr_legacy_irqs; i++) + per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + x86_init.irqs.intr_init(); } diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 1268d993e9ca..2f1ca5614292 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -235,6 +235,8 @@ void __init vmi_time_init(void) vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.2 From b27d515a49169e5e2a92d621faac761074a8c5b1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 18 Jan 2010 10:58:01 +0200 Subject: perf: x86: Add support for the ANY bit Propagate the ANY bit into the fixed counter config for v3 and higher. Signed-off-by: Stephane Eranian [a.p.zijlstra@chello.nl: split from larger patch] Signed-off-by: Peter Zijlstra LKML-Reference: <4b5430c6.0f975e0a.1bf9.ffff85fe@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8d9f8548a870..1380367dabd9 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -19,6 +19,7 @@ #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 #define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) #define ARCH_PERFMON_EVENTSEL_INT (1 << 20) #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) #define ARCH_PERFMON_EVENTSEL_USR (1 << 16) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d616c06e99b4..8c1c07073ccc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1343,6 +1343,13 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + bits <<= (idx * 4); mask = 0xfULL << (idx * 4); -- cgit v1.2.2 From d91afd15b041f27d34859c79afa9e172018a86f4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jan 2010 16:40:20 +0100 Subject: x86/amd-iommu: Fix possible integer overflow The variable i in this function could be increased to over 2**32 which would result in an integer overflow when using int. Fix it by changing i to unsigned long. 
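A minimal stand-alone demonstration of that failure mode (illustrative only, not the driver code): incrementing a 32-bit int past INT_MAX is undefined behaviour and in practice wraps negative, while unsigned long is 64-bit on x86-64 and keeps counting.

/* Sketch: why a counter that can pass 2^31 must not be a plain int. */
#include <stdio.h>
#include <limits.h>

int main(void)
{
	int i = INT_MAX;		/* 32-bit counter about to overflow */
	unsigned long l = INT_MAX;	/* 64-bit on x86-64 */

	i++;	/* signed overflow: undefined behaviour, typically wraps negative */
	l++;	/* 2147483648, as intended */
	printf("int: %d, unsigned long: %lu\n", i, l);
	return 0;
}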
Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 23824fef789c..c2ccbd7b862f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -980,7 +980,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; struct amd_iommu *iommu; - int i; + unsigned long i; #ifdef CONFIG_IOMMU_STRESS populate = false; -- cgit v1.2.2 From 2ca762790caf822f7b61430fbaffa3ae4219977f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jan 2010 16:45:31 +0100 Subject: x86/amd-iommu: Fix NULL pointer dereference in __detach_device() In the __detach_device function the reference count for a device-domain binding may become zero. This results in the device being removed from the domain and dev_data->domain will be NULL. This is bad because this pointer is dereferenced when trying to unlock the domain->lock. This patch fixes the issue by keeping the domain in a separate variable. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c2ccbd7b862f..4478a48198a8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1489,11 +1489,14 @@ static void __detach_device(struct device *dev) { struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; + struct protection_domain *domain; unsigned long flags; BUG_ON(!dev_data->domain); - spin_lock_irqsave(&dev_data->domain->lock, flags); + domain = dev_data->domain; + + spin_lock_irqsave(&domain->lock, flags); if (dev_data->alias != dev) { alias_data = get_dev_data(dev_data->alias); @@ -1504,7 +1507,7 @@ static void __detach_device(struct device *dev) if (atomic_dec_and_test(&dev_data->bind)) do_detach(dev); - spin_unlock_irqrestore(&dev_data->domain->lock, flags); + spin_unlock_irqrestore(&domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the -- cgit v1.2.2 From f5325094379158e6b876ea0010c807bf7890ec8f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jan 2010 17:44:35 +0100 Subject: x86/amd-iommu: Fix IOMMU-API initialization for iommu=pt This patch moves the initialization of the iommu-api out of the dma-ops initialization code. This ensures that the iommu-api is initialized even with iommu=pt.
Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 1 + arch/x86/kernel/amd_iommu.c | 8 ++++++-- arch/x86/kernel/amd_iommu_init.c | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 4d817f9e6e77..d2544f1d705d 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -31,6 +31,7 @@ extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); extern int amd_iommu_init_devices(void); extern void amd_iommu_uninit_devices(void); extern void amd_iommu_init_notifier(void); +extern void amd_iommu_init_api(void); #ifndef CONFIG_AMD_IOMMU_STATS static inline void amd_iommu_stats_init(void) { } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 4478a48198a8..751ce73c6e1b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2221,6 +2221,12 @@ static struct dma_map_ops amd_iommu_dma_ops = { /* * The function which clues the AMD IOMMU driver into dma_ops. */ + +void __init amd_iommu_init_api(void) +{ + register_iommu(&amd_iommu_ops); +} + int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; @@ -2256,8 +2262,6 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; - register_iommu(&amd_iommu_ops); - amd_iommu_stats_init(); return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fb490ce7dd55..9dc91b431470 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1292,9 +1292,12 @@ static int __init amd_iommu_init(void) ret = amd_iommu_init_passthrough(); else ret = amd_iommu_init_dma_ops(); + if (ret) goto free; + amd_iommu_init_api(); + amd_iommu_init_notifier(); enable_iommus(); -- cgit v1.2.2 From d3ad9373b7c29b63d5e8460a69453718d200cc3b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jan 2010 17:55:27 +0100 Subject: x86/amd-iommu: Fix deassignment of a device from the pt_domain Deassigning a device from the passthrough domain does not work and breaks device assignment to kvm guests. This patch fixes the issue. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 751ce73c6e1b..adb0ba025702 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1511,9 +1511,11 @@ static void __detach_device(struct device *dev) /* * If we run in passthrough mode the device must be assigned to the - * passthrough domain if it is detached from any other domain + * passthrough domain if it is detached from any other domain. + * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through && dev_data->domain == NULL) + if (iommu_pass_through && + (dev_data->domain == NULL && domain != pt_domain)) __attach_device(dev, pt_domain); } -- cgit v1.2.2 From a7b480e7f30b3813353ec009f10f2ac7a6669f3b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 22 Jan 2010 16:01:03 +0100 Subject: x86, lib: Add wbinvd smp helpers Add wbinvd_on_cpu and wbinvd_on_all_cpus stubs for executing wbinvd on a particular CPU. [ hpa: renamed lib/smp.c to lib/cache-smp.c ] [ hpa: wbinvd_on_all_cpus() returns int, but wbinvd() returns void. 
Thus, the former cannot be a macro for the latter, replace with an inline function. ] Signed-off-by: Borislav Petkov LKML-Reference: <1264172467-25155-2-git-send-email-bp@amd64.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/smp.h | 9 +++++++++ arch/x86/lib/Makefile | 2 +- arch/x86/lib/cache-smp.c | 19 +++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 arch/x86/lib/cache-smp.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 1e796782cd7b..4cfc90824068 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -135,6 +135,8 @@ int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); @@ -147,6 +149,13 @@ static inline int num_booting_cpus(void) { return cpumask_weight(cpu_callout_mask); } +#else /* !CONFIG_SMP */ +#define wbinvd_on_cpu(cpu) wbinvd() +static inline int wbinvd_on_all_cpus(void) +{ + wbinvd(); + return 0; +} #endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index cffd754f3039..d85e0e438b58 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -14,7 +14,7 @@ $(obj)/inat.o: $(obj)/inat-tables.c clean-files := inat-tables.c -obj-$(CONFIG_SMP) += msr-smp.o +obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o lib-y += thunk_$(BITS).o diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c new file mode 100644 index 000000000000..a3c668875038 --- /dev/null +++ b/arch/x86/lib/cache-smp.c @@ -0,0 +1,19 @@ +#include +#include + +static void __wbinvd(void *dummy) +{ + wbinvd(); +} + +void wbinvd_on_cpu(int cpu) +{ + smp_call_function_single(cpu, __wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_cpu); + +int wbinvd_on_all_cpus(void) +{ + return on_each_cpu(__wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_all_cpus); -- cgit v1.2.2 From dcf39daf3d6d97f8741e82f0b9fb7554704ed2d1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 22 Jan 2010 16:01:05 +0100 Subject: x86, cacheinfo: Fix disabling of L3 cache indices * Correct the masks used for writing the cache index disable indices. * Do not turn off L3 scrubber - it is not necessary. * Make sure wbinvd is executed on the same node where the L3 is. * Check for out-of-bounds values written to the registers. * Make show_cache_disable hex values unambiguous * Check for Erratum #388 Signed-off-by: Borislav Petkov LKML-Reference: <1264172467-25155-4-git-send-email-bp@amd64.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc6c8ef92dcc..08c91abc4d32 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -18,6 +18,7 @@ #include #include #include +#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -299,8 +300,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) if (boot_cpu_data.x86 == 0x11) return; - /* see erratum #382 */ - if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + /* see errata #382 and #388 */ + if ((boot_cpu_data.x86 == 0x10) && + ((boot_cpu_data.x86_model < 0x9) || + (boot_cpu_data.x86_mask < 0x1))) return; this_leaf->can_disable = 1; @@ -726,12 +729,12 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, return -EINVAL; pci_read_config_dword(dev, 0x1BC + index * 4, &reg); - return sprintf(buf, "%x\n", reg); + return sprintf(buf, "0x%08x\n", reg); } #define SHOW_CACHE_DISABLE(index) \ static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ { \ return show_cache_disable(this_leaf, buf, index); \ } @@ -745,7 +748,9 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; - unsigned int scrubber = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff if (!this_leaf->can_disable) return -EINVAL; @@ -759,21 +764,24 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - val |= 0xc0000000; - - pci_read_config_dword(dev, 0x58, &scrubber); - scrubber &= ~0x1f000000; - pci_write_config_dword(dev, 0x58, scrubber); + /* do not allow writes outside of allowed bits */ + if (val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) + return -EINVAL; - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); - wbinvd(); + val |= BIT(30); pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. + */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); return count; } #define STORE_CACHE_DISABLE(index) \ static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ const char *buf, size_t count) \ { \ return store_cache_disable(this_leaf, buf, count, index); \ -- cgit v1.2.2 From 897de50e08937663912c86fb12ad7f708af2386c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 22 Jan 2010 16:01:06 +0100 Subject: x86, cacheinfo: Add cache index disable sysfs attrs only to L3 caches The cache_disable_[01] attribute in /sys/devices/system/cpu/cpu?/cache/index[0-3]/ is enabled on all cache levels although only L3 supports it. Add it only to the cache level that actually supports it. Signed-off-by: Borislav Petkov LKML-Reference: <1264172467-25155-5-git-send-email-bp@amd64.org> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 08c91abc4d32..3976ce95095f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -814,16 +814,24 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); +#define DEFAULT_SYSFS_CACHE_ATTRS \ + &type.attr, \ + &level.attr, \ + &coherency_line_size.attr, \ + &physical_line_partition.attr, \ + &ways_of_associativity.attr, \ + &number_of_sets.attr, \ + &size.attr, \ + &shared_cpu_map.attr, \ + &shared_cpu_list.attr + static struct attribute *default_attrs[] = { - &type.attr, - &level.attr, - &coherency_line_size.attr, - &physical_line_partition.attr, - &ways_of_associativity.attr, - &number_of_sets.attr, - &size.attr, - &shared_cpu_map.attr, - &shared_cpu_list.attr, + DEFAULT_SYSFS_CACHE_ATTRS, + NULL +}; + +static struct attribute *default_l3_attrs[] = { + DEFAULT_SYSFS_CACHE_ATTRS, &cache_disable_0.attr, &cache_disable_1.attr, NULL @@ -916,6 +924,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; + struct _cpuid4_info *this_leaf; int retval; retval = cpuid4_cache_sysfs_init(cpu); @@ -934,6 +943,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; + + this_leaf = CPUID4_INFO_IDX(cpu, i); + + if (this_leaf->can_disable) + ktype_cache.default_attrs = default_l3_attrs; + else + ktype_cache.default_attrs = default_attrs; + retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), -- cgit v1.2.2 From 048a8774ca43488d78605031f11cc206d7a2682a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 22 Jan 2010 16:01:07 +0100 Subject: x86, cacheinfo: Calculate L3 indices We need to know the valid L3 indices interval when disabling them over /sysfs. Do that when the core is brought online and add boundary checks to the sysfs .store attribute. Signed-off-by: Borislav Petkov LKML-Reference: <1264172467-25155-6-git-send-email-bp@amd64.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3976ce95095f..589b705e80ed 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -151,7 +151,8 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; @@ -161,7 +162,8 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; }; unsigned short num_cache_leaves; @@ -291,6 +293,29 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +static unsigned int __cpuinit amd_calc_l3_indices(void) +{ + /* + * We're called over smp_call_function_single() and therefore + * are on the correct cpu. + */ + int cpu = smp_processor_id(); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int sc0, sc1, sc2, sc3; + u32 val; + + pci_read_config_dword(dev, 0x1C4, &val); + + /* calculate subcache sizes */ + sc0 = !(val & BIT(0)); + sc1 = !(val & BIT(4)); + sc2 = !(val & BIT(8)) + !(val & BIT(9)); + sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; +} + static void __cpuinit amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { @@ -306,7 +331,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) (boot_cpu_data.x86_mask < 0x1))) return; - this_leaf->can_disable = 1; + this_leaf->can_disable = true; + this_leaf->l3_indices = amd_calc_l3_indices(); } static int @@ -765,7 +791,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, return -EINVAL; /* do not allow writes outside of allowed bits */ - if (val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) return -EINVAL; val |= BIT(30); -- cgit v1.2.2 From 3a5fc0e40cb467e692737bc798bc99773c81e1e2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 20 Jan 2010 12:10:47 -0800 Subject: x86: Set hotpluggable nodes in nodes_possible_map nodes_possible_map does not currently include nodes that have SRAT entries that are all ACPI_SRAT_MEM_HOT_PLUGGABLE since the bit is cleared in nodes_parsed if it does not have an online address range. Unequivocally setting the bit in nodes_parsed is insufficient since existing code, such as acpi_get_nodes(), assumes all nodes in the map have online address ranges. In fact, all code using nodes_parsed assumes such nodes represent an address range of online memory. nodes_possible_map is created by unioning nodes_parsed and cpu_nodes_parsed; the former represents nodes with online memory and the latter represents memoryless nodes. We now set the bit for hotpluggable nodes in cpu_nodes_parsed so that it also gets set in nodes_possible_map. [ hpa: Haicheng Li points out that this makes the naming of the variable cpu_nodes_parsed somewhat counterintuitive. However, leave it as is in the interest of keeping the pure bug fix patch small. ] Signed-off-by: David Rientjes Tested-by: Haicheng Li LKML-Reference: Cc: Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/srat_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a27124185fc1..28c68762648f 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -229,9 +229,11 @@ update_nodes_add(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - if (changed) + if (changed) { + node_set(node, cpu_nodes_parsed); printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + } } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -- cgit v1.2.2 From 73472a46b5b28116b145fb5fc05242c1aa8e1461 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 21 Jan 2010 11:09:52 -0800 Subject: x86: Disable HPET MSI on ATI SB700/SB800 HPET MSI on platforms with ATI SB700/SB800 seems to have side-effects on floppy DMA. Do not use HPET MSI on such platforms. Original problem report from Mark Hounschell http://lkml.indiana.edu/hypermail/linux/kernel/0912.2/01118.html [ This patch needs to go to stable as well. But, there are some conflicts that prevent the patch from going as is. I can rebase/resubmit to stable once the patch goes upstream. hpa: still Cc:'ing stable@ as an FYI. ] Tested-by: Mark Hounschell Signed-off-by: Venkatesh Pallipadi Cc: LKML-Reference: <20100121190952.GA32523@linux-os.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/hpet.h | 1 + arch/x86/kernel/hpet.c | 8 ++++++++ arch/x86/kernel/quirks.c | 13 +++++++++++++ 3 files changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 5d89fd2a3690..1d5c08a1bdfd 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -67,6 +67,7 @@ extern unsigned long hpet_address; extern unsigned long force_hpet_address; extern u8 hpet_blockid; extern int hpet_force_user; +extern u8 hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba6e65884603..ad80a1c718c6 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -34,6 +34,8 @@ */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ +u8 hpet_msi_disable; + #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -596,6 +598,9 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (hpet_msi_disable) + return; + if (boot_cpu_has(X86_FEATURE_ARAT)) return; id = hpet_readl(HPET_ID); @@ -928,6 +933,9 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (hpet_msi_disable) + return 0; + if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 18093d7498f0..12e9feaa2f7a 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,6 +491,19 @@ void force_hpet_resume(void) break; } } + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms.
+ */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + #endif #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) -- cgit v1.2.2 From 3b2e3d85aeb80769fb96c15ee4f6e14135328471 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 22 Jan 2010 21:34:56 +0100 Subject: Revert "x86: ucode-amd: Load ucode-patches once ..." Commit d1c84f79a6ba992dc01e312c44a21496303874d6 leads to a regression when microcode_amd.c is compiled into the kernel. It causes a big boot delay because the firmware is not available. See http://marc.info/?l=linux-kernel&m=126267290920060 It also renders the reload sysfs attribute useless. Fixing this is too intrusive for an -rc5 kernel. Thus I'd like to restore the microcode loading behaviour of kernel 2.6.32. CC: Gene Heskett Signed-off-by: Andreas Herrmann LKML-Reference: <20100122203456.GB13792@alberich.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/microcode.h | 2 -- arch/x86/kernel/microcode_amd.c | 44 ++++++++++++---------------------------- arch/x86/kernel/microcode_core.c | 6 ------ 3 files changed, 13 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index c24ca9a56458..ef51b501e22a 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -12,8 +12,6 @@ struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; struct microcode_ops { - void (*init)(struct device *device); - void (*fini)(void); enum ucode_state (*request_microcode_user) (int cpu, const void __user *buf, size_t size); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 37542b67c57e..e1af7c055c7d 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -36,9 +36,6 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 -const struct firmware *firmware; -static int supported_cpu; - struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -77,12 +74,15 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { + struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - if (!supported_cpu) - return -1; - memset(csig, 0, sizeof(*csig)); + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); + return -1; + } rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; @@ -294,10 +294,14 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; enum ucode_state ret; - if (firmware == NULL) + if (request_firmware(&firmware, fw_name, device)) { + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return UCODE_NFOUND; + } if (*(u32 *)firmware->data != UCODE_MAGIC) { pr_err("invalid UCODE_MAGIC (0x%08x)\n", @@ -307,6 +311,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) ret = generic_load_microcode(cpu, firmware->data, firmware->size); + release_firmware(firmware); + return ret; } @@ -325,31 +331,7 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } -void 
init_microcode_amd(struct device *device) -{ - const char *fw_name = "amd-ucode/microcode_amd.bin"; - struct cpuinfo_x86 *c = &boot_cpu_data; - - WARN_ON(c->x86_vendor != X86_VENDOR_AMD); - - if (c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); - return; - } - supported_cpu = 1; - - if (request_firmware(&firmware, fw_name, device)) - pr_err("failed to load file %s\n", fw_name); -} - -void fini_microcode_amd(void) -{ - release_firmware(firmware); -} - static struct microcode_ops microcode_amd_ops = { - .init = init_microcode_amd, - .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 0c8632433090..cceb5bc3c3c2 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -521,9 +521,6 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } - if (microcode_ops->init) - microcode_ops->init(&microcode_pdev->dev); - get_online_cpus(); mutex_lock(&microcode_mutex); @@ -566,9 +563,6 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); - if (microcode_ops->fini) - microcode_ops->fini(); - microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -- cgit v1.2.2 From b160091802d4a76dd063facb09fcf10bf5d5d747 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 23 Jan 2010 18:27:47 -0800 Subject: x86: Remove "x86 CPU features in debugfs" (CONFIG_X86_CPU_DEBUG) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_X86_CPU_DEBUG, which provides some parsed versions of the x86 CPU configuration via debugfs, has caused boot failures on real hardware. The value of this feature has been marginal at best, as all this information is already available to userspace via generic interfaces. Causes crashes that have not been fixed + minimal utility -> remove. See the referenced LKML thread for more information. Reported-by: Ozan Çağlayan Signed-off-by: H. Peter Anvin LKML-Reference: Cc: Jaswinder Singh Rajput Cc: Linus Torvalds Cc: Rafael J. Wysocki Cc: Yinghai Lu Cc: --- arch/x86/Kconfig | 6 - arch/x86/include/asm/cpu_debug.h | 127 -------- arch/x86/kernel/cpu/Makefile | 2 - arch/x86/kernel/cpu/cpu_debug.c | 688 --------------------------------------- 4 files changed, 823 deletions(-) delete mode 100644 arch/x86/include/asm/cpu_debug.h delete mode 100644 arch/x86/kernel/cpu/cpu_debug.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbcbfdee3ee0..eb4092568f9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -989,12 +989,6 @@ config X86_CPUID with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -config X86_CPU_DEBUG - tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support" - ---help--- - If you select this option, this will provide various x86 CPUs - information through debugfs.
- choice prompt "High Memory Support" default HIGHMEM4G if !X86_NUMAQ diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h deleted file mode 100644 index d96c1ee3a95c..000000000000 --- a/arch/x86/include/asm/cpu_debug.h +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef _ASM_X86_CPU_DEBUG_H -#define _ASM_X86_CPU_DEBUG_H - -/* - * CPU x86 architecture debug - * - * Copyright(C) 2009 Jaswinder Singh Rajput - */ - -/* Register flags */ -enum cpu_debug_bit { -/* Model Specific Registers (MSRs) */ - CPU_MC_BIT, /* Machine Check */ - CPU_MONITOR_BIT, /* Monitor */ - CPU_TIME_BIT, /* Time */ - CPU_PMC_BIT, /* Performance Monitor */ - CPU_PLATFORM_BIT, /* Platform */ - CPU_APIC_BIT, /* APIC */ - CPU_POWERON_BIT, /* Power-on */ - CPU_CONTROL_BIT, /* Control */ - CPU_FEATURES_BIT, /* Features control */ - CPU_LBRANCH_BIT, /* Last Branch */ - CPU_BIOS_BIT, /* BIOS */ - CPU_FREQ_BIT, /* Frequency */ - CPU_MTTR_BIT, /* MTRR */ - CPU_PERF_BIT, /* Performance */ - CPU_CACHE_BIT, /* Cache */ - CPU_SYSENTER_BIT, /* Sysenter */ - CPU_THERM_BIT, /* Thermal */ - CPU_MISC_BIT, /* Miscellaneous */ - CPU_DEBUG_BIT, /* Debug */ - CPU_PAT_BIT, /* PAT */ - CPU_VMX_BIT, /* VMX */ - CPU_CALL_BIT, /* System Call */ - CPU_BASE_BIT, /* BASE Address */ - CPU_VER_BIT, /* Version ID */ - CPU_CONF_BIT, /* Configuration */ - CPU_SMM_BIT, /* System mgmt mode */ - CPU_SVM_BIT, /*Secure Virtual Machine*/ - CPU_OSVM_BIT, /* OS-Visible Workaround*/ -/* Standard Registers */ - CPU_TSS_BIT, /* Task Stack Segment */ - CPU_CR_BIT, /* Control Registers */ - CPU_DT_BIT, /* Descriptor Table */ -/* End of Registers flags */ - CPU_REG_ALL_BIT, /* Select all Registers */ -}; - -#define CPU_REG_ALL (~0) /* Select all Registers */ - -#define CPU_MC (1 << CPU_MC_BIT) -#define CPU_MONITOR (1 << CPU_MONITOR_BIT) -#define CPU_TIME (1 << CPU_TIME_BIT) -#define CPU_PMC (1 << CPU_PMC_BIT) -#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT) -#define CPU_APIC (1 << CPU_APIC_BIT) -#define CPU_POWERON (1 << CPU_POWERON_BIT) -#define CPU_CONTROL (1 << CPU_CONTROL_BIT) -#define CPU_FEATURES (1 << CPU_FEATURES_BIT) -#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT) -#define CPU_BIOS (1 << CPU_BIOS_BIT) -#define CPU_FREQ (1 << CPU_FREQ_BIT) -#define CPU_MTRR (1 << CPU_MTTR_BIT) -#define CPU_PERF (1 << CPU_PERF_BIT) -#define CPU_CACHE (1 << CPU_CACHE_BIT) -#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT) -#define CPU_THERM (1 << CPU_THERM_BIT) -#define CPU_MISC (1 << CPU_MISC_BIT) -#define CPU_DEBUG (1 << CPU_DEBUG_BIT) -#define CPU_PAT (1 << CPU_PAT_BIT) -#define CPU_VMX (1 << CPU_VMX_BIT) -#define CPU_CALL (1 << CPU_CALL_BIT) -#define CPU_BASE (1 << CPU_BASE_BIT) -#define CPU_VER (1 << CPU_VER_BIT) -#define CPU_CONF (1 << CPU_CONF_BIT) -#define CPU_SMM (1 << CPU_SMM_BIT) -#define CPU_SVM (1 << CPU_SVM_BIT) -#define CPU_OSVM (1 << CPU_OSVM_BIT) -#define CPU_TSS (1 << CPU_TSS_BIT) -#define CPU_CR (1 << CPU_CR_BIT) -#define CPU_DT (1 << CPU_DT_BIT) - -/* Register file flags */ -enum cpu_file_bit { - CPU_INDEX_BIT, /* index */ - CPU_VALUE_BIT, /* value */ -}; - -#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) - -#define MAX_CPU_FILES 512 - -struct cpu_private { - unsigned cpu; - unsigned type; - unsigned reg; - unsigned file; -}; - -struct cpu_debug_base { - char *name; /* Register name */ - unsigned flag; /* Register flag */ - unsigned write; /* Register write flag */ -}; - -/* - * Currently it looks similar to cpu_debug_base but once we add more files - * cpu_file_base will go in different direction - */ -struct cpu_file_base { - char *name; /* Register file 
name */ - unsigned flag; /* Register file flag */ - unsigned write; /* Register write flag */ -}; - -struct cpu_cpuX_base { - struct dentry *dentry; /* Register dentry */ - int init; /* Register index file */ -}; - -struct cpu_debug_range { - unsigned min; /* Register range min */ - unsigned max; /* Register range max */ - unsigned flag; /* Supported flags */ -}; - -#endif /* _ASM_X86_CPU_DEBUG_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1d2cb383410e..c202b62f3671 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o -obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c deleted file mode 100644 index b368cd862997..000000000000 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CPU x86 architecture debug code - * - * Copyright(C) 2009 Jaswinder Singh Rajput - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr); -static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr); -static DEFINE_PER_CPU(int, cpud_priv_count); - -static DEFINE_MUTEX(cpu_debug_lock); - -static struct dentry *cpu_debugfs_dir; - -static struct cpu_debug_base cpu_base[] = { - { "mc", CPU_MC, 0 }, - { "monitor", CPU_MONITOR, 0 }, - { "time", CPU_TIME, 0 }, - { "pmc", CPU_PMC, 1 }, - { "platform", CPU_PLATFORM, 0 }, - { "apic", CPU_APIC, 0 }, - { "poweron", CPU_POWERON, 0 }, - { "control", CPU_CONTROL, 0 }, - { "features", CPU_FEATURES, 0 }, - { "lastbranch", CPU_LBRANCH, 0 }, - { "bios", CPU_BIOS, 0 }, - { "freq", CPU_FREQ, 0 }, - { "mtrr", CPU_MTRR, 0 }, - { "perf", CPU_PERF, 0 }, - { "cache", CPU_CACHE, 0 }, - { "sysenter", CPU_SYSENTER, 0 }, - { "therm", CPU_THERM, 0 }, - { "misc", CPU_MISC, 0 }, - { "debug", CPU_DEBUG, 0 }, - { "pat", CPU_PAT, 0 }, - { "vmx", CPU_VMX, 0 }, - { "call", CPU_CALL, 0 }, - { "base", CPU_BASE, 0 }, - { "ver", CPU_VER, 0 }, - { "conf", CPU_CONF, 0 }, - { "smm", CPU_SMM, 0 }, - { "svm", CPU_SVM, 0 }, - { "osvm", CPU_OSVM, 0 }, - { "tss", CPU_TSS, 0 }, - { "cr", CPU_CR, 0 }, - { "dt", CPU_DT, 0 }, - { "registers", CPU_REG_ALL, 0 }, -}; - -static struct cpu_file_base cpu_file[] = { - { "index", CPU_REG_ALL, 0 }, - { "value", CPU_REG_ALL, 1 }, -}; - -/* CPU Registers Range */ -static struct cpu_debug_range cpu_reg_range[] = { - { 0x00000000, 0x00000001, CPU_MC, }, - { 0x00000006, 0x00000007, CPU_MONITOR, }, - { 0x00000010, 0x00000010, CPU_TIME, }, - { 0x00000011, 0x00000013, CPU_PMC, }, - { 0x00000017, 0x00000017, CPU_PLATFORM, }, - { 0x0000001B, 0x0000001B, CPU_APIC, }, - { 0x0000002A, 0x0000002B, CPU_POWERON, }, - { 0x0000002C, 0x0000002C, CPU_FREQ, }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, }, - { 0x00000040, 0x00000047, CPU_LBRANCH, }, - { 0x00000060, 0x00000067, CPU_LBRANCH, }, - { 0x00000079, 0x00000079, CPU_BIOS, }, - { 0x00000088, 0x0000008A, CPU_CACHE, }, - { 0x0000008B, 0x0000008B, CPU_BIOS, }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, }, - { 0x000000C1, 0x000000C4, 
CPU_PMC, }, - { 0x000000CD, 0x000000CD, CPU_FREQ, }, - { 0x000000E7, 0x000000E8, CPU_PERF, }, - { 0x000000FE, 0x000000FE, CPU_MTRR, }, - - { 0x00000116, 0x0000011E, CPU_CACHE, }, - { 0x00000174, 0x00000176, CPU_SYSENTER, }, - { 0x00000179, 0x0000017B, CPU_MC, }, - { 0x00000186, 0x00000189, CPU_PMC, }, - { 0x00000198, 0x00000199, CPU_PERF, }, - { 0x0000019A, 0x0000019A, CPU_TIME, }, - { 0x0000019B, 0x0000019D, CPU_THERM, }, - { 0x000001A0, 0x000001A0, CPU_MISC, }, - { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, }, - { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, }, - { 0x00000250, 0x00000250, CPU_MTRR, }, - { 0x00000258, 0x00000259, CPU_MTRR, }, - { 0x00000268, 0x0000026F, CPU_MTRR, }, - { 0x00000277, 0x00000277, CPU_PAT, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, }, - - { 0x00000300, 0x00000311, CPU_PMC, }, - { 0x00000345, 0x00000345, CPU_PMC, }, - { 0x00000360, 0x00000371, CPU_PMC, }, - { 0x0000038D, 0x00000390, CPU_PMC, }, - { 0x000003A0, 0x000003BE, CPU_PMC, }, - { 0x000003C0, 0x000003CD, CPU_PMC, }, - { 0x000003E0, 0x000003E1, CPU_PMC, }, - { 0x000003F0, 0x000003F2, CPU_PMC, }, - - { 0x00000400, 0x00000417, CPU_MC, }, - { 0x00000480, 0x0000048B, CPU_VMX, }, - - { 0x00000600, 0x00000600, CPU_DEBUG, }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, - - { 0x000107CC, 0x000107D3, CPU_PMC, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, }, - { 0xC0000081, 0xC0000084, CPU_CALL, }, - { 0xC0000100, 0xC0000102, CPU_BASE, }, - { 0xC0000103, 0xC0000103, CPU_TIME, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, }, - { 0xC0010010, 0xC0010010, CPU_CONF, }, - { 0xC0010015, 0xC0010015, CPU_CONF, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, }, - { 0xC001001F, 0xC001001F, CPU_CONF, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, }, - { 0xC0010044, 0xC0010048, CPU_MC, }, - { 0xC0010050, 0xC0010056, CPU_SMM, }, - { 0xC0010058, 0xC0010058, CPU_CONF, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, }, - { 0xC0010061, 0xC0010068, CPU_SMM, }, - { 0xC0010069, 0xC001006B, CPU_SMM, }, - { 0xC0010070, 0xC0010071, CPU_SMM, }, - { 0xC0010111, 0xC0010113, CPU_SMM, }, - { 0xC0010114, 0xC0010118, CPU_SVM, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, }, - { 0xC0011022, 0xC0011023, CPU_CONF, }, -}; - -static int is_typeflag_valid(unsigned cpu, unsigned flag) -{ - int i; - - /* Standard Registers should be always valid */ - if (flag >= CPU_TSS) - return 1; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (cpu_reg_range[i].flag == flag) - return 1; - } - - /* Invalid */ - return 0; -} - -static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, - int index, unsigned flag) -{ - if (cpu_reg_range[index].flag == flag) { - *min = cpu_reg_range[index].min; - *max = cpu_reg_range[index].max; - } else - *max = 0; - - return *max; -} - -/* This function can also be called with seq = NULL for printk */ -static void print_cpu_data(struct seq_file *seq, unsigned type, - u32 low, u32 high) -{ - struct cpu_private *priv; - u64 val = high; - - if (seq) { - priv = seq->private; - if (priv->file) { - val = (val << 32) | low; - seq_printf(seq, "0x%llx\n", val); - } else - seq_printf(seq, " %08x: %08x_%08x\n", - type, high, low); - } else - printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); -} - -/* This function can also be called with seq = NULL for printk */ -static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) -{ - 
unsigned msr, msr_min, msr_max; - struct cpu_private *priv; - u32 low, high; - int i; - - if (seq) { - priv = seq->private; - if (priv->file) { - if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, - &low, &high)) - print_cpu_data(seq, priv->reg, low, high); - return; - } - } - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) - continue; - - for (msr = msr_min; msr <= msr_max; msr++) { - if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) - continue; - print_cpu_data(seq, msr, low, high); - } - } -} - -static void print_tss(void *arg) -{ - struct pt_regs *regs = task_pt_regs(current); - struct seq_file *seq = arg; - unsigned int seg; - - seq_printf(seq, " RAX\t: %016lx\n", regs->ax); - seq_printf(seq, " RBX\t: %016lx\n", regs->bx); - seq_printf(seq, " RCX\t: %016lx\n", regs->cx); - seq_printf(seq, " RDX\t: %016lx\n", regs->dx); - - seq_printf(seq, " RSI\t: %016lx\n", regs->si); - seq_printf(seq, " RDI\t: %016lx\n", regs->di); - seq_printf(seq, " RBP\t: %016lx\n", regs->bp); - seq_printf(seq, " ESP\t: %016lx\n", regs->sp); - -#ifdef CONFIG_X86_64 - seq_printf(seq, " R08\t: %016lx\n", regs->r8); - seq_printf(seq, " R09\t: %016lx\n", regs->r9); - seq_printf(seq, " R10\t: %016lx\n", regs->r10); - seq_printf(seq, " R11\t: %016lx\n", regs->r11); - seq_printf(seq, " R12\t: %016lx\n", regs->r12); - seq_printf(seq, " R13\t: %016lx\n", regs->r13); - seq_printf(seq, " R14\t: %016lx\n", regs->r14); - seq_printf(seq, " R15\t: %016lx\n", regs->r15); -#endif - - asm("movl %%cs,%0" : "=r" (seg)); - seq_printf(seq, " CS\t: %04x\n", seg); - asm("movl %%ds,%0" : "=r" (seg)); - seq_printf(seq, " DS\t: %04x\n", seg); - seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); - asm("movl %%es,%0" : "=r" (seg)); - seq_printf(seq, " ES\t: %04x\n", seg); - asm("movl %%fs,%0" : "=r" (seg)); - seq_printf(seq, " FS\t: %04x\n", seg); - asm("movl %%gs,%0" : "=r" (seg)); - seq_printf(seq, " GS\t: %04x\n", seg); - - seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); - - seq_printf(seq, " EIP\t: %016lx\n", regs->ip); -} - -static void print_cr(void *arg) -{ - struct seq_file *seq = arg; - - seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); - seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); - seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); - seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); -#ifdef CONFIG_X86_64 - seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); -#endif -} - -static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) -{ - seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); -} - -static void print_dt(void *seq) -{ - struct desc_ptr dt; - unsigned long ldt; - - /* IDT */ - store_idt((struct desc_ptr *)&dt); - print_desc_ptr("IDT", seq, dt); - - /* GDT */ - store_gdt((struct desc_ptr *)&dt); - print_desc_ptr("GDT", seq, dt); - - /* LDT */ - store_ldt(ldt); - seq_printf(seq, " LDT\t: %016lx\n", ldt); - - /* TR */ - store_tr(ldt); - seq_printf(seq, " TR\t: %016lx\n", ldt); -} - -static void print_dr(void *arg) -{ - struct seq_file *seq = arg; - unsigned long dr; - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - get_debugreg(dr, i); - seq_printf(seq, " dr%d\t: %016lx\n", i, dr); - } - - seq_printf(seq, "\n MSR\t:\n"); -} - -static void print_apic(void *arg) -{ - struct seq_file *seq = arg; - -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(seq, " LAPIC\t:\n"); - seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); - seq_printf(seq, " LVR\t\t: %08x\n", 
apic_read(APIC_LVR)); - seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); - seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); - seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); - seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); - seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); - seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); - seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); - seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); - seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); - seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); - seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); - seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); - seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); - seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); - seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); - seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); - seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); - seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); - seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); - if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { - unsigned int i, v, maxeilvt; - - v = apic_read(APIC_EFEAT); - maxeilvt = (v >> 16) & 0xff; - seq_printf(seq, " EFEAT\t\t: %08x\n", v); - seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); - - for (i = 0; i < maxeilvt; i++) { - v = apic_read(APIC_EILVTn(i)); - seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); - } - } -#endif /* CONFIG_X86_LOCAL_APIC */ - seq_printf(seq, "\n MSR\t:\n"); -} - -static int cpu_seq_show(struct seq_file *seq, void *v) -{ - struct cpu_private *priv = seq->private; - - if (priv == NULL) - return -EINVAL; - - switch (cpu_base[priv->type].flag) { - case CPU_TSS: - smp_call_function_single(priv->cpu, print_tss, seq, 1); - break; - case CPU_CR: - smp_call_function_single(priv->cpu, print_cr, seq, 1); - break; - case CPU_DT: - smp_call_function_single(priv->cpu, print_dt, seq, 1); - break; - case CPU_DEBUG: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_dr, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - case CPU_APIC: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_apic, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - - default: - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - } - seq_printf(seq, "\n"); - - return 0; -} - -static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos == 0) /* One time is enough ;-) */ - return seq; - - return NULL; -} - -static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - (*pos)++; - - return cpu_seq_start(seq, pos); -} - -static void cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations cpu_seq_ops = { - .start = cpu_seq_start, - .next = cpu_seq_next, - .stop = cpu_seq_stop, - .show = cpu_seq_show, -}; - -static int cpu_seq_open(struct inode *inode, struct file *file) -{ - struct cpu_private *priv = inode->i_private; - struct seq_file *seq; - int err; - - err = seq_open(file, &cpu_seq_ops); - if (!err) { - seq = file->private_data; - seq->private = priv; - } - - return err; -} - -static int write_msr(struct cpu_private *priv, u64 val) -{ - u32 low, high; - - high = (val >> 32) & 0xffffffff; - low = val & 0xffffffff; - - if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) - return 
0; - - return -EPERM; -} - -static int write_cpu_register(struct cpu_private *priv, const char *buf) -{ - int ret = -EPERM; - u64 val; - - ret = strict_strtoull(buf, 0, &val); - if (ret < 0) - return ret; - - /* Supporting only MSRs */ - if (priv->type < CPU_TSS_BIT) - return write_msr(priv, val); - - return ret; -} - -static ssize_t cpu_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct cpu_private *priv = seq->private; - char buf[19]; - - if ((priv == NULL) || (count >= sizeof(buf))) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) - if (!write_cpu_register(priv, buf)) - return count; - - return -EACCES; -} - -static const struct file_operations cpu_fops = { - .owner = THIS_MODULE, - .open = cpu_seq_open, - .read = seq_read, - .write = cpu_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, - unsigned file, struct dentry *dentry) -{ - struct cpu_private *priv = NULL; - - /* Already intialized */ - if (file == CPU_INDEX_BIT) - if (per_cpu(cpud_arr[type].init, cpu)) - return 0; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv == NULL) - return -ENOMEM; - - priv->cpu = cpu; - priv->type = type; - priv->reg = reg; - priv->file = file; - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_priv_arr[type], cpu) = priv; - per_cpu(cpud_priv_count, cpu)++; - mutex_unlock(&cpu_debug_lock); - - if (file) - debugfs_create_file(cpu_file[file].name, S_IRUGO, - dentry, (void *)priv, &cpu_fops); - else { - debugfs_create_file(cpu_base[type].name, S_IRUGO, - per_cpu(cpud_arr[type].dentry, cpu), - (void *)priv, &cpu_fops); - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_arr[type].init, cpu) = 1; - mutex_unlock(&cpu_debug_lock); - } - - return 0; -} - -static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, - struct dentry *dentry) -{ - unsigned file; - int err = 0; - - for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { - err = cpu_create_file(cpu, type, reg, file, dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned reg, reg_min, reg_max; - int i, err = 0; - char reg_dir[12]; - u32 low, high; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, &reg_min, &reg_max, i, - cpu_base[type].flag)) - continue; - - for (reg = reg_min; reg <= reg_max; reg++) { - if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) - continue; - - sprintf(reg_dir, "0x%x", reg); - cpu_dentry = debugfs_create_dir(reg_dir, dentry); - err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); - if (err) - return err; - } - } - - return err; -} - -static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned type; - int err = 0; - - for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { - if (!is_typeflag_valid(cpu, cpu_base[type].flag)) - continue; - cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); - per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry; - - if (type < CPU_TSS_BIT) - err = cpu_init_msr(cpu, type, cpu_dentry); - else - err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, - cpu_dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_cpu(void) -{ - struct dentry *cpu_dentry = NULL; - struct
cpuinfo_x86 *cpui; - char cpu_dir[12]; - unsigned cpu; - int err = 0; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - cpui = &cpu_data(cpu); - if (!cpu_has(cpui, X86_FEATURE_MSR)) - continue; - - sprintf(cpu_dir, "cpu%d", cpu); - cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); - err = cpu_init_allreg(cpu, cpu_dentry); - - pr_info("cpu%d(%d) debug files %d\n", - cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu)); - if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) { - pr_err("Register files count %d exceeds limit %d\n", - per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES); - per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES; - err = -ENFILE; - } - if (err) - return err; - } - - return err; -} - -static int __init cpu_debug_init(void) -{ - cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); - - return cpu_init_cpu(); -} - -static void __exit cpu_debug_exit(void) -{ - int i, cpu; - - if (cpu_debugfs_dir) - debugfs_remove_recursive(cpu_debugfs_dir); - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++) - kfree(per_cpu(cpud_priv_arr[i], cpu)); -} - -module_init(cpu_debug_init); -module_exit(cpu_debug_exit); - -MODULE_AUTHOR("Jaswinder Singh Rajput"); -MODULE_DESCRIPTION("CPU Debug module"); -MODULE_LICENSE("GPL"); -- cgit v1.2.2 From a5d36f82c4f3e852b61fdf1fee13463c8aa91b90 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Dec 2009 12:42:16 +0200 Subject: KVM: Fix race between APIC TMR and IRR When we queue an interrupt to the local apic, we set the IRR before the TMR. The vcpu can pick up the IRR and inject the interrupt before setting the TMR, and perhaps even EOI it, causing incorrect behaviour. The race is really insignificant since it can only occur on the first interrupt (usually following interrupts will not change TMR), but it's better closed than open. Fixed by reordering setting the TMR vs IRR. Cc: stable@kernel.org Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3063a0c4858b..ba8c045da782 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -373,6 +373,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); @@ -383,11 +389,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); kvm_vcpu_kick(vcpu); break; -- cgit v1.2.2 From 82b7005f0e72d8d1a8226e4c192cbb0850d10b3f Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 5 Jan 2010 19:02:28 +0800 Subject: KVM: x86: Fix host_mapping_level() When an error hva is found, host_mapping_level() should return the page-table level, not PAGE_SIZE. Also clean up the coding style of the following loop.
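[ For illustration, a minimal sketch of the corrected error path; everything in host_mapping_level() except the hva check is elided: ]

	static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
	{
		unsigned long addr = gfn_to_hva(kvm, gfn);

		/*
		 * A bad hva must map at the smallest (4K) level; returning
		 * page_size here would hand back a byte count as a level.
		 */
		if (kvm_is_error_hva(addr))
			return PT_PAGE_TABLE_LEVEL;

		/* ... otherwise walk the VMA to find the host page size ... */
	}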
Cc: stable@kernel.org Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4c3e5b2314cb..89a49fb46a27 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -477,7 +477,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return page_size; + return PT_PAGE_TABLE_LEVEL; down_read(&current->mm->mmap_sem); vma = find_vma(current->mm, addr); @@ -515,11 +515,9 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { - + for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; - } return level - 1; } -- cgit v1.2.2 From a6085fbaf65ab09bfb5ec8d902d6d21680fe1895 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 14 Jan 2010 17:41:27 -0200 Subject: KVM: MMU: bail out pagewalk on kvm_read_guest error Exit the guest pagetable walk loop if reading gpte failed. Otherwise it's possible to enter an endless loop processing the previous present pte. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 58a0f1e88596..ede2131a9225 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -150,7 +150,9 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) + goto not_present; + trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) -- cgit v1.2.2 From 36cb93fd6b6bf7e9163a69a8bf20207aed5fea44 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Jan 2010 14:18:47 +0800 Subject: KVM: x86: Fix probable memory leak of vcpu->arch.mce_banks vcpu->arch.mce_banks is allocated in kvm_arch_vcpu_init() but never freed anywhere, which may cause a memory leak. Fix this by freeing it in kvm_arch_vcpu_uninit(). Cc: stable@kernel.org Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6651dbf58675..b265eecc741f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5088,6 +5088,7 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); down_read(&vcpu->kvm->slots_lock); kvm_mmu_destroy(vcpu); -- cgit v1.2.2 From 443c39bc9ef7d8f648408d74c97e943f3bb3f48a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Jan 2010 14:21:29 +0800 Subject: KVM: x86: Fix leak of lapic data in kvm_arch_vcpu_init() In kvm_arch_vcpu_init(), if the memory allocation for vcpu->arch.mce_banks fails, the lapic data is not freed. This patch fixes it.
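[ For illustration, the goto-based unwind idiom the fix below follows; a simplified sketch with hypothetical names, not the literal kvm_arch_vcpu_init(): ]

	int example_vcpu_init(struct kvm_vcpu *vcpu)
	{
		int r = kvm_create_lapic(vcpu);

		if (r < 0)
			goto fail;

		vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
					       GFP_KERNEL);
		if (!vcpu->arch.mce_banks) {
			r = -ENOMEM;
			/* unwind the lapic allocation made above, then fail */
			goto fail_free_lapic;
		}
		return 0;

	fail_free_lapic:
		kvm_free_lapic(vcpu);
	fail:
		return r;
	}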
Cc: stable@kernel.org Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b265eecc741f..1ddcad452add 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5072,12 +5072,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) GFP_KERNEL); if (!vcpu->arch.mce_banks) { r = -ENOMEM; - goto fail_mmu_destroy; + goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; return 0; - +fail_free_lapic: + kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); fail_free_pio_data: -- cgit v1.2.2 From d8cc108f4fab42b380c6b3f3356f99e8dd5372e2 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Jan 2010 11:25:36 -0600 Subject: oprofile/x86: fix crash when profiling more than 28 events With multiplexing enabled oprofile crashes when profiling more than 28 events. This patch fixes this. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cb88b1a0bd5f..76d4f566adee 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -222,7 +222,7 @@ static void nmi_cpu_switch(void *dummy) /* move to next set */ si += model->num_counters; - if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) per_cpu(switch_index, cpu) = 0; else per_cpu(switch_index, cpu) = si; -- cgit v1.2.2 From e83e452b0692c9c13372540deb88a77d4ae2553d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 21 Jan 2010 23:26:27 +0100 Subject: oprofile/x86: add Xeon 7500 series support Add Xeon 7500 series support to oprofile. Straightforward: it's the same as Core i7, so just detect the model number. No user space changes needed. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 76d4f566adee..3347f696edc7 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -598,6 +598,7 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 0x2e: case 26: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; -- cgit v1.2.2 From da482474b8396e1a099c37ffc6541b78775aedb4 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Tue, 26 Jan 2010 20:37:22 -0600 Subject: x86, msr/cpuid: Pass the number of minors when unregistering MSR and CPUID drivers. Pass the number of minors when unregistering MSR and CPUID drivers. Reported-by: Dean Nelson Signed-off-by: Dean Nelson LKML-Reference: <20100127023722.GA22305@sgi.com> Signed-off-by: Russ Anderson Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpuid.c | 2 +- arch/x86/kernel/msr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index cb27fd6136c9..83e5e628de73 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -229,7 +229,7 @@ static void __exit cpuid_exit(void) for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4bd93c9b2b27..206735ac8cbd 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -285,7 +285,7 @@ static void __exit msr_exit(void) for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); } -- cgit v1.2.2 From aca3bb5910119d4cf6c28568a642582efb4cc14a Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 22 Jan 2010 09:41:40 -0600 Subject: x86, UV: Fix RTC latency bug by reading replicated cachelines For SGI UV node controllers (HUB) rev 2.0 or greater, use replicated cachelines to read the RTC timer. This optimization allows faster simultaneous reads from a given socket. Signed-off-by: Dimitri Sivanich Cc: Jack Steiner LKML-Reference: <20100122154140.GB4975@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 3c84aa001c11..2b75ef638dbc 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -282,10 +282,21 @@ static int uv_rtc_unset_timer(int cpu, int force) /* * Read the RTC. + * + * Starting with HUB rev 2.0, the UV RTC register is replicated across all + * cachelines of it's own page. This allows faster simultaneous reads + * from a given socket. */ static cycle_t uv_read_rtc(struct clocksource *cs) { - return (cycle_t)uv_read_local_mmr(UVH_RTC); + unsigned long offset; + + if (uv_get_min_hub_revision_id() == 1) + offset = 0; + else + offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; + + return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); } /* -- cgit v1.2.2 From 2854e72b58dad82f5248b30a45dda0df49e5fe05 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 27 Jan 2010 17:32:22 +0100 Subject: x86: Use helpers for rlimits Make sure compiler won't do weird things with limits. Fetching them twice may return 2 different values after writable limits are implemented. We can either use rlimit helpers added in 3e10e716abf3c71bdb5d86b8f507f9e72236c9cd or ACCESS_ONCE if not applicable; this patch uses the helpers. Signed-off-by: Jiri Slaby LKML-Reference: <1264609942-24621-1-git-send-email-jslaby@suse.cz> Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: H.
Peter Anvin --- arch/x86/ia32/ia32_aout.c | 2 +- arch/x86/mm/mmap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 2a4d073d2cf1..06474788723f 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -297,7 +297,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index c8191defc38a..1dab5194fd9d 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -71,7 +71,7 @@ static int mmap_is_legacy(void) if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void) static unsigned long mmap_base(void) { - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = rlimit(RLIMIT_STACK); if (gap < MIN_GAP) gap = MIN_GAP; -- cgit v1.2.2 From 35ea63d70f827a26c150993b4b940925bb02b03f Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 27 Jan 2010 15:29:18 -0800 Subject: x86: Add Dell OptiPlex 760 reboot quirk Dell OptiPlex 760 hangs on reboot unless reboot=bios is used. Add quirk to reboot through the BIOS. BugLink: https://bugs.launchpad.net/bugs/488319 Signed-off-by: Leann Ogasawara LKML-Reference: <1264634958.27335.1091.camel@emiko> Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1545bc0c9845..704bddcdf64d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", -- cgit v1.2.2 From 439913fffd39374c3737186b22d2d56c3a0ae526 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Thu, 28 Jan 2010 10:53:19 +0800 Subject: ACPI: replace acpi_integer by u64 acpi_integer is now obsolete and removed from the ACPICA code base, replaced by u64. 
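[ For illustration: acpi_integer has been a 64-bit quantity since ACPI 2.0 (roughly "typedef u64 acpi_integer;"), so the conversion below is mechanical; a hypothetical before/after: ]

	/* before: */
	acpi_integer control = data->acpi_data.states[i].control;

	/* after — the obsolete typedef is dropped, call sites use u64: */
	u64 control = data->acpi_data.states[i].control;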
Signed-off-by: Lin Ming Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f125e5c551c0..55d42bc443e8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data) static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - acpi_integer control; + u64 control; if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; @@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val = -ENODEV; - acpi_integer control, status; + u64 control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); @@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, u32 fid; u32 vid; u32 freq, index; - acpi_integer status, control; + u64 status, control; if (data->exttype) { status = data->acpi_data.states[i].status; -- cgit v1.2.2 From 339ce1a4dc2ca26444c4f65c31b71a5056f3bb0b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 18 Jan 2010 16:47:07 +1100 Subject: perf: Fix inconsistency between IP and callchain sampling When running perf across all cpus with backtracing (-a -g), sometimes we get samples without associated backtraces: 23.44% init [kernel] [k] restore 11.46% init eeba0c [k] 0x00000000eeba0c 6.77% swapper [kernel] [k] .perf_ctx_adjust_freq 5.73% init [kernel] [k] .__trace_hcall_entry 4.69% perf libc-2.9.so [.] 0x0000000006bb8c | |--11.11%-- 0xfffa941bbbc It turns out the backtrace code has a check for the idle task and the IP sampling does not. This creates problems when profiling an interrupt heavy workload (in my case 10Gbit ethernet) since we get no backtraces for interrupts received while idle (i.e. most of the workload). Right now x86 and sh check that current is not NULL, which should never happen so remove that too. Idle task's exclusion must be performed from the core code, on top of perf_event_attr:exclude_idle. Signed-off-by: Anton Blanchard Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Benjamin Herrenschmidt Cc: Paul Mundt LKML-Reference: <20100118054707.GT12666@kryten> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b1bb8c550526..ed1998b28a7c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2425,9 +2425,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) is_user = user_mode(regs); - if (!current || current->pid == 0) - return; - if (is_user && current->state != TASK_RUNNING) return; -- cgit v1.2.2 From e8e06eae4ffd683931b928f460c11c40cd3f7fd8 Mon Sep 17 00:00:00 2001 From: Jeff Garrett Date: Wed, 27 Jan 2010 22:02:26 -0600 Subject: x86/PCI: remove IOH range fetching This turned out to cause trouble on single IOH machines, and is superseded by _CRS on multi-IOH machines with production BIOSes.
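[ For context, a hedged sketch of the replacement path: with the config-space probe removed, host bridge windows come from the ACPI _CRS method; conceptually the x86 PCI/ACPI root code does something like: ]

	/*
	 * Walk the bridge's _CRS resources instead of decoding IOH
	 * registers (IOH_LIO, IOH_LMMIOL, ...) by hand; setup_resource()
	 * here stands for the callback that records each window.
	 */
	acpi_walk_resources(device->handle, METHOD_NAME__CRS,
			    setup_resource, info);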
Signed-off-by: Jeff Garrett Signed-off-by: Jesse Barnes --- arch/x86/pci/Makefile | 2 +- arch/x86/pci/intel_bus.c | 94 ------------------------------------------------ 2 files changed, 1 insertion(+), 95 deletions(-) delete mode 100644 arch/x86/pci/intel_bus.c (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 564b008a51c7..39fba37f702f 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o obj-y += amd_bus.o -obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o +obj-$(CONFIG_X86_64) += bus_numa.o ifeq ($(CONFIG_PCI_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c deleted file mode 100644 index f81a2fa8fe25..000000000000 --- a/arch/x86/pci/intel_bus.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * to read io range from IOH pci conf, need to do it after mmconfig is there - */ - -#include -#include -#include -#include -#include - -#include "bus_numa.h" - -static inline void print_ioh_resources(struct pci_root_info *info) -{ - int res_num; - int busnum; - int i; - - printk(KERN_DEBUG "IOH bus: [%02x, %02x]\n", - info->bus_min, info->bus_max); - res_num = info->res_num; - busnum = info->bus_min; - for (i = 0; i < res_num; i++) { - struct resource *res; - - res = &info->res[i]; - printk(KERN_DEBUG "IOH bus: %02x index %x %s: [%llx, %llx]\n", - busnum, i, - (res->flags & IORESOURCE_IO) ? "io port" : - "mmio", - res->start, res->end); - } -} - -#define IOH_LIO 0x108 -#define IOH_LMMIOL 0x10c -#define IOH_LMMIOH 0x110 -#define IOH_LMMIOH_BASEU 0x114 -#define IOH_LMMIOH_LIMITU 0x118 -#define IOH_LCFGBUS 0x11c - -static void __devinit pci_root_bus_res(struct pci_dev *dev) -{ - u16 word; - u32 dword; - struct pci_root_info *info; - u16 io_base, io_end; - u32 mmiol_base, mmiol_end; - u64 mmioh_base, mmioh_end; - int bus_base, bus_end; - - /* some sys doesn't get mmconf enabled */ - if (dev->cfg_size < 0x120) - return; - - if (pci_root_num >= PCI_ROOT_NR) { - printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n"); - return; - } - - info = &pci_root_info[pci_root_num]; - pci_root_num++; - - pci_read_config_word(dev, IOH_LCFGBUS, &word); - bus_base = (word & 0xff); - bus_end = (word & 0xff00) >> 8; - sprintf(info->name, "PCI Bus #%02x", bus_base); - info->bus_min = bus_base; - info->bus_max = bus_end; - - pci_read_config_word(dev, IOH_LIO, &word); - io_base = (word & 0xf0) << (12 - 4); - io_end = (word & 0xf000) | 0xfff; - update_res(info, io_base, io_end, IORESOURCE_IO, 0); - - pci_read_config_dword(dev, IOH_LMMIOL, &dword); - mmiol_base = (dword & 0xff00) << (24 - 8); - mmiol_end = (dword & 0xff000000) | 0xffffff; - update_res(info, mmiol_base, mmiol_end, IORESOURCE_MEM, 0); - - pci_read_config_dword(dev, IOH_LMMIOH, &dword); - mmioh_base = ((u64)(dword & 0xfc00)) << (26 - 10); - mmioh_end = ((u64)(dword & 0xfc000000) | 0x3ffffff); - pci_read_config_dword(dev, IOH_LMMIOH_BASEU, &dword); - mmioh_base |= ((u64)(dword & 0x7ffff)) << 32; - pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword); - mmioh_end |= ((u64)(dword & 0x7ffff)) << 32; - update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0); - - print_ioh_resources(info); -} - -/* intel IOH */ -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x342e, pci_root_bus_res); -- cgit v1.2.2 From 40f9249a73f6c251adea492b1c3d19d39e2a9bda Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Thu, 28 Jan 2010 16:44:01 +0530 Subject: x86/debug: Clear reserved bits of DR6 in do_debug() Clear the 
reserved bits from the stored copy of the debug status register (DR6). This makes bitwise operations on it, such as quickly testing the origin of a debug event, easier. Signed-off-by: K.Prasad Cc: Roland McGrath Cc: Jan Kiszka Cc: Alan Stern Cc: Ingo Molnar LKML-Reference: <20100128111401.GB13935@in.ibm.com> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/debugreg.h | 3 +++ arch/x86/kernel/traps.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 8240f76b531e..b81002f23614 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -14,6 +14,9 @@ which debugging register was responsible for the trap. The other bits are either reserved or not of interest to us. */ +/* Define reserved bits in DR6 which are always set to 1 */ +#define DR6_RESERVED (0xFFFF0FF0) + #define DR_TRAP0 (0x1) /* db0 */ #define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP2 (0x4) /* db2 */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 33399176512a..1168e4454188 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) get_debugreg(dr6, 6); + /* Filter out all the reserved bits which are preset to 1 */ + dr6 &= ~DR6_RESERVED; + /* Catch kmemcheck conditions first of all! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; -- cgit v1.2.2 From e0e53db6133c32964fd17f20b17073a402f07ed3 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Thu, 28 Jan 2010 16:44:15 +0530 Subject: x86/hw-breakpoints: Optimize return code from notifier chain in hw_breakpoint_handler In most cases, processing of a debug exception in do_debug() can stop once hw_breakpoint_handler() returns NOTIFY_STOP for an exception that originated from a hardware breakpoint. But for certain cases, such as: a) user-space breakpoints with pending SIGTRAP signal delivery (as in the case of ptrace-induced breakpoints), and b) exceptions due to causes other than breakpoints, we continue to process the exception by returning NOTIFY_DONE.
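[ For illustration, the notifier contract being tuned: NOTIFY_STOP consumes the exception, NOTIFY_DONE lets do_debug() keep processing. A schematic of the handler's tail, mirroring the diff below: ]

	/* default: hw-breakpoint exceptions are fully handled here */
	int rc = NOTIFY_STOP;

	/*
	 * Pending user-space work (e.g. a ptrace-induced SIGTRAP) or DR6
	 * bits set by other causes mean do_debug() must keep processing.
	 */
	if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
	    (dr6 & ~DR_TRAP_BITS))
		rc = NOTIFY_DONE;

	return rc;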
Signed-off-by: K.Prasad Cc: Ingo Molnar Cc: Roland McGrath Cc: Alan Stern Cc: Jan Kiszka LKML-Reference: <20100128111415.GC13935@in.ibm.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/hw_breakpoint.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 05d5fec64a94..ae90b4739435 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -502,8 +502,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) rcu_read_lock(); bp = per_cpu(bp_per_reg[i], cpu); - if (bp) - rc = NOTIFY_DONE; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -522,7 +520,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) rcu_read_unlock(); } - if (dr6 & (~DR_TRAP_BITS)) + /* + * Further processing in do_debug() is needed for a) user-space + * breakpoints (to generate signals) and b) when the system has + * taken exception due to multiple causes + */ + if ((current->thread.debugreg6 & DR_TRAP_BITS) || + (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; set_debugreg(dr7, 7); -- cgit v1.2.2 From 1da53e023029c067ba1277a33038c65d6e4c99b3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 18 Jan 2010 10:58:01 +0200 Subject: perf_events, x86: Improve x86 event scheduling This patch improves event scheduling by maximizing the use of PMU registers regardless of the order in which events are created in a group. The algorithm takes into account the list of counter constraints for each event. It assigns events to counters from the most constrained, i.e., works on only one counter, to the least constrained, i.e., works on any counter. Intel Fixed counter events and the BTS special event are also handled via this algorithm which is designed to be fairly generic. The patch also updates the validation of an event to use the scheduling algorithm. This will cause early failure in perf_event_open(). The 2nd version of this patch follows the model used by PPC, by running the scheduling algorithm and the actual assignment separately. Actual assignment takes place in hw_perf_enable() whereas scheduling is implemented in hw_perf_group_sched_in() and x86_pmu_enable(). Signed-off-by: Stephane Eranian [ fixup whitespace and style nits as well as adding is_x86_event() ] Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4b5430c6.0f975e0a.1bf9.ffff85fe@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 16 +- arch/x86/kernel/cpu/perf_event.c | 775 +++++++++++++++++++++++++++----------- 2 files changed, 574 insertions(+), 217 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8d9f8548a870..dbc082685d52 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -26,7 +26,14 @@ /* * Includes eventsel and unit mask as well: */ -#define ARCH_PERFMON_EVENT_MASK 0xffff + + +#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL +#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL +#define INTEL_ARCH_EDGE_MASK 0x00040000ULL +#define INTEL_ARCH_INV_MASK 0x00800000ULL +#define INTEL_ARCH_CNT_MASK 0xFF000000ULL +#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK) /* * filter mask to validate fixed counter events. @@ -37,7 +44,12 @@ * The other filters are supported by fixed counters. 
* The any-thread option is supported starting with v3. */ -#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 +#define INTEL_ARCH_FIXED_MASK \ + (INTEL_ARCH_CNT_MASK| \ + INTEL_ARCH_INV_MASK| \ + INTEL_ARCH_EDGE_MASK|\ + INTEL_ARCH_UNIT_MASK|\ + INTEL_ARCH_EVENT_MASK) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ed1998b28a7c..995ac4ae379c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -7,6 +7,7 @@ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, + * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ @@ -68,26 +69,37 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) + +struct event_constraint { + u64 idxmsk[BITS_TO_U64(X86_PMC_IDX_MAX)]; + int code; + int cmask; +}; + struct cpu_hw_events { - struct perf_event *events[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; struct debug_store *ds; -}; -struct event_constraint { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int code; + int n_events; + int n_added; + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } -#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } +#define EVENT_CONSTRAINT(c, n, m) { \ + .code = (c), \ + .cmask = (m), \ + .idxmsk[0] = (n) } -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->idxmsk[0]; (e)++) +#define EVENT_CONSTRAINT_END \ + { .code = 0, .cmask = 0, .idxmsk[0] = 0 } +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -114,8 +126,9 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - int (*get_event_idx)(struct cpu_hw_events *cpuc, - struct hw_perf_event *hwc); + void (*get_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event, u64 *idxmsk); + void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); + const struct event_constraint *event_constraints; }; static struct x86_pmu x86_pmu __read_mostly; @@ -124,7 +137,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static const struct event_constraint *event_constraints; +static int x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx); /* * Not sure about some of these @@ -171,14 +185,14 @@ static u64 p6_pmu_raw_event(u64 hw_event) return hw_event & P6_EVNTSEL_MASK; } -static const struct event_constraint intel_p6_event_constraints[] = +static struct event_constraint intel_p6_event_constraints[] = { - EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0xc1, 0x1, INTEL_ARCH_EVENT_MASK), /* 
FLOPS */ + EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */ EVENT_CONSTRAINT_END }; @@ -196,32 +210,43 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static const struct event_constraint intel_core_event_constraints[] = -{ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ +static struct event_constraint intel_core_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ + EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ + EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x2, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0x18, 0x1, INTEL_ARCH_EVENT_MASK), /* IDLE_DURING_DIV */ + EVENT_CONSTRAINT(0x19, 0x2, INTEL_ARCH_EVENT_MASK), /* DELAYED_BYPASS */ + EVENT_CONSTRAINT(0xa1, 0x1, INTEL_ARCH_EVENT_MASK), /* RS_UOPS_DISPATCH_CYCLES */ + EVENT_CONSTRAINT(0xcb, 0x1, INTEL_ARCH_EVENT_MASK), /* MEM_LOAD_RETIRED */ EVENT_CONSTRAINT_END }; -static const struct event_constraint intel_nehalem_event_constraints[] = -{ - EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ - EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ +static struct event_constraint intel_nehalem_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ + EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ + EVENT_CONSTRAINT(0x40, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LD */ + EVENT_CONSTRAINT(0x41, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_ST */ + EVENT_CONSTRAINT(0x42, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK */ + EVENT_CONSTRAINT(0x43, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_ALL_REF */ + EVENT_CONSTRAINT(0x4e, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_PREFETCH */ + EVENT_CONSTRAINT(0x4c, 0x3, INTEL_ARCH_EVENT_MASK), /* LOAD_HIT_PRE */ + EVENT_CONSTRAINT(0x51, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D */ + EVENT_CONSTRAINT(0x52, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0x53, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0xc5, 0x3, INTEL_ARCH_EVENT_MASK), /* 
CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ + EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ EVENT_CONSTRAINT_END }; @@ -527,11 +552,11 @@ static u64 intel_pmu_raw_event(u64 hw_event) #define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_REG_MASK) + (INTEL_ARCH_EVTSEL_MASK | \ + INTEL_ARCH_UNIT_MASK | \ + INTEL_ARCH_EDGE_MASK | \ + INTEL_ARCH_INV_MASK | \ + INTEL_ARCH_CNT_MASK) return hw_event & CORE_EVNTSEL_MASK; } @@ -1120,9 +1145,15 @@ static void amd_pmu_disable_all(void) void hw_perf_disable(void) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + if (!x86_pmu_initialized()) return; - return x86_pmu.disable_all(); + + if (cpuc->enabled) + cpuc->n_added = 0; + + x86_pmu.disable_all(); } static void p6_pmu_enable_all(void) @@ -1189,10 +1220,237 @@ static void amd_pmu_enable_all(void) } } +static const struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) +{ + return event->pmu == &pmu; +} + +static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + int i, j , w, num; + int weight, wmax; + unsigned long *c; + u64 constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + + for (i = 0; i < n; i++) { + x86_pmu.get_event_constraints(cpuc, + cpuc->event_list[i], + constraints[i]); + } + + /* + * weight = number of possible counters + * + * 1 = most constrained, only works on one counter + * wmax = least constrained, works on any counter + * + * assign events to counters starting with most + * constrained events. + */ + wmax = x86_pmu.num_events; + + /* + * when fixed event counters are present, + * wmax is incremented by 1 to account + * for one more choice + */ + if (x86_pmu.num_events_fixed) + wmax++; + + num = n; + for (w = 1; num && w <= wmax; w++) { + /* for each event */ + for (i = 0; i < n; i++) { + c = (unsigned long *)constraints[i]; + hwc = &cpuc->event_list[i]->hw; + + weight = bitmap_weight(c, X86_PMC_IDX_MAX); + if (weight != w) + continue; + + /* + * try to reuse previous assignment + * + * This is possible despite the fact that + * events or events order may have changed. + * + * What matters is the level of constraints + * of an event and this is constant for now. + * + * This is possible also because we always + * scan from most to least constrained. Thus, + * if a counter can be reused, it means no, + * more constrained events, needed it. And + * next events will either compete for it + * (which cannot be solved anyway) or they + * have fewer constraints, and they can use + * another counter. + */ + j = hwc->idx; + if (j != -1 && !test_bit(j, used_mask)) + goto skip; + + for_each_bit(j, c, X86_PMC_IDX_MAX) { + if (!test_bit(j, used_mask)) + break; + } + + if (j == X86_PMC_IDX_MAX) + break; +skip: + set_bit(j, used_mask); + +#if 0 + pr_debug("CPU%d config=0x%llx idx=%d assign=%c\n", + smp_processor_id(), + hwc->config, + j, + assign ? 
'y' : 'n'); +#endif + + if (assign) + assign[i] = j; + num--; + } + } + /* + * scheduling failed or is just a simulation, + * free resources if necessary + */ + if (!assign || num) { + for (i = 0; i < n; i++) { + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + } + } + return num ? -ENOSPC : 0; +} + +/* + * dogrp: true if we must collect sibling events (group) + * returns total number of events and error code + */ +static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; + + /* current number of events already accepted */ + n = cpuc->n_events; + + if (is_x86_event(leader)) { + if (n >= max_count) + return -ENOSPC; + cpuc->event_list[n] = leader; + n++; + } + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_x86_event(event) || + event->state == PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -ENOSPC; + + cpuc->event_list[n] = event; + n++; + } + return n; +} + + +static inline void x86_assign_hw_event(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + hwc->idx = idx; + + if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + hwc->config_base = 0; + hwc->event_base = 0; + } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that event_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->event_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + } else { + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; + } +} + void hw_perf_enable(void) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + int i; + if (!x86_pmu_initialized()) return; + if (cpuc->n_added) { + /* + * apply assignment obtained either from + * hw_perf_group_sched_in() or x86_pmu_enable() + * + * step1: save events moving to new counters + * step2: reprogram moved events into new counters + */ + for (i = 0; i < cpuc->n_events; i++) { + + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == -1 || hwc->idx == cpuc->assign[i]) + continue; + + x86_pmu.disable(hwc, hwc->idx); + + clear_bit(hwc->idx, cpuc->active_mask); + barrier(); + cpuc->events[hwc->idx] = NULL; + + x86_perf_event_update(event, hwc, hwc->idx); + + hwc->idx = -1; + } + + for (i = 0; i < cpuc->n_events; i++) { + + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == -1) { + x86_assign_hw_event(event, hwc, cpuc->assign[i]); + x86_perf_event_set_period(event, hwc, hwc->idx); + } + /* + * need to mark as active because x86_pmu_disable() + * clears active_mask and events[] yet it preserves + * idx + */ + set_bit(hwc->idx, cpuc->active_mask); + cpuc->events[hwc->idx] = event; + + x86_pmu.enable(hwc, hwc->idx); + perf_event_update_userpage(event); + } + cpuc->n_added = 0; + perf_events_lapic_init(); + } x86_pmu.enable_all(); } @@ -1391,148 +1649,43 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) x86_pmu_enable_event(hwc, idx); } -static int fixed_mode_idx(struct hw_perf_event *hwc) -{ - unsigned int hw_event; - - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; - - if (!x86_pmu.num_events_fixed) - 
return -1; - - /* - * fixed counters do not take all possible filters - */ - if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) - return -1; - - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; - - return -1; -} - -/* - * generic counter allocator: get next free counter - */ -static int -gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); - return idx == x86_pmu.num_events ? -1 : idx; -} - /* - * intel-specific counter allocator: check event constraints - */ -static int -intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - const struct event_constraint *event_constraint; - int i, code; - - if (!event_constraints) - goto skip; - - code = hwc->config & CORE_EVNTSEL_EVENT_MASK; - - for_each_event_constraint(event_constraint, event_constraints) { - if (code == event_constraint->code) { - for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { - if (!test_and_set_bit(i, cpuc->used_mask)) - return i; - } - return -1; - } - } -skip: - return gen_get_event_idx(cpuc, hwc); -} - -static int -x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = fixed_mode_idx(hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->event_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed event, if that is already taken - * then try to get a generic event: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; - - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic event again */ - if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = x86_pmu.get_event_idx(cpuc, hwc); - if (idx == -1) - return -EAGAIN; - - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; - } - - return idx; -} - -/* - * Find a PMC slot for the freshly enabled / scheduled in event: + * activate a single event + * + * The event is added to the group of enabled events + * but only if it can be scheduled with existing events. + * + * Called with PMU disabled. 
If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() */ static int x86_pmu_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int idx; + struct hw_perf_event *hwc; + int assign[X86_PMC_IDX_MAX]; + int n, n0, ret; - idx = x86_schedule_event(cpuc, hwc); - if (idx < 0) - return idx; + hwc = &event->hw; - perf_events_lapic_init(); + n0 = cpuc->n_events; + n = collect_events(cpuc, event, false); + if (n < 0) + return n; - x86_pmu.disable(hwc, idx); - - cpuc->events[idx] = event; - set_bit(idx, cpuc->active_mask); + ret = x86_schedule_events(cpuc, n, assign); + if (ret) + return ret; + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); - x86_perf_event_set_period(event, hwc, idx); - x86_pmu.enable(hwc, idx); + cpuc->n_events = n; + cpuc->n_added = n - n0; - perf_event_update_userpage(event); + if (hwc->idx != -1) + x86_perf_event_set_period(event, hwc, hwc->idx); return 0; } @@ -1576,7 +1729,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -1664,7 +1817,7 @@ static void x86_pmu_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; + int i, idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler @@ -1690,8 +1843,19 @@ static void x86_pmu_disable(struct perf_event *event) intel_pmu_drain_bts_buffer(cpuc); cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); + for (i = 0; i < cpuc->n_events; i++) { + if (event == cpuc->event_list[i]) { + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); + + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; + + --cpuc->n_events; + } + } perf_event_update_userpage(event); } @@ -1962,6 +2126,176 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } +static struct event_constraint bts_constraint = { + .code = 0, + .cmask = 0, + .idxmsk[0] = 1ULL << X86_PMC_IDX_FIXED_BTS +}; + +static int intel_special_constraints(struct perf_event *event, + u64 *idxmsk) +{ + unsigned int hw_event; + + hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (event->hw.sample_period == 1))) { + + bitmap_copy((unsigned long *)idxmsk, + (unsigned long *)bts_constraint.idxmsk, + X86_PMC_IDX_MAX); + return 1; + } + return 0; +} + +static void intel_get_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + u64 *idxmsk) +{ + const struct event_constraint *c; + + /* + * cleanup bitmask + */ + bitmap_zero((unsigned long *)idxmsk, X86_PMC_IDX_MAX); + + if (intel_special_constraints(event, idxmsk)) + return; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + + bitmap_copy((unsigned long *)idxmsk, + (unsigned long *)c->idxmsk, + X86_PMC_IDX_MAX); + return; + } + } + } + /* no constraints, means supports all generic counters */ + 
bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events); +} + +static void amd_get_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + u64 *idxmsk) +{ +} + +static int x86_event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, int cpu) +{ + int ret = 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; + event->tstamp_running += event->ctx->time - event->tstamp_stopped; + + if (!is_x86_event(event)) + ret = event->pmu->enable(event); + + if (!ret && !is_software_event(event)) + cpuctx->active_oncpu++; + + if (!ret && event->attr.exclusive) + cpuctx->exclusive = 1; + + return ret; +} + +static void x86_event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, int cpu) +{ + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + + if (!is_x86_event(event)) + event->pmu->disable(event); + + event->tstamp_running -= event->ctx->time - event->tstamp_stopped; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +/* + * Called to enable a whole group of events. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + * + * called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() + */ +int hw_perf_group_sched_in(struct perf_event *leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct perf_event *sub; + int assign[X86_PMC_IDX_MAX]; + int n0, n1, ret; + + /* n0 = total number of events */ + n0 = collect_events(cpuc, leader, true); + if (n0 < 0) + return n0; + + ret = x86_schedule_events(cpuc, n0, assign); + if (ret) + return ret; + + ret = x86_event_sched_in(leader, cpuctx, cpu); + if (ret) + return ret; + + n1 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state != PERF_EVENT_STATE_OFF) { + ret = x86_event_sched_in(sub, cpuctx, cpu); + if (ret) + goto undo; + ++n1; + } + } + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n0*sizeof(int)); + + cpuc->n_events = n0; + cpuc->n_added = n1; + ctx->nr_active += n1; + + /* + * 1 means successful and events are active + * This is not quite true because we defer + * actual activation until hw_perf_enable() but + * this way we ensure caller won't try to enable + * individual events + */ + return 1; +undo: + x86_event_sched_out(leader, cpuctx, cpu); + n0 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + x86_event_sched_out(sub, cpuctx, cpu); + if (++n0 == n1) + break; + } + } + return ret; +} + static __read_mostly struct notifier_block perf_event_nmi_notifier = { .notifier_call = perf_event_nmi_handler, .next = NULL, @@ -1993,7 +2327,8 @@ static __initconst struct x86_pmu p6_pmu = { */ .event_bits = 32, .event_mask = (1ULL << 32) - 1, - .get_event_idx = intel_get_event_idx, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_p6_event_constraints }; static __initconst struct x86_pmu intel_pmu = { @@ -2017,7 +2352,7 @@ static __initconst struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, - 
.get_event_idx = intel_get_event_idx, + .get_event_constraints = intel_get_event_constraints }; static __initconst struct x86_pmu amd_pmu = { @@ -2038,7 +2373,7 @@ static __initconst struct x86_pmu amd_pmu = { .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_idx = gen_get_event_idx, + .get_event_constraints = amd_get_event_constraints }; static __init int p6_pmu_init(void) @@ -2051,12 +2386,9 @@ static __init int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ - event_constraints = intel_p6_event_constraints; - break; case 9: case 13: /* Pentium M */ - event_constraints = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -2121,23 +2453,29 @@ static __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + x86_pmu.event_constraints = intel_core_event_constraints; pr_cont("Core2 events, "); - event_constraints = intel_core_event_constraints; break; - default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - event_constraints = intel_nehalem_event_constraints; + x86_pmu.event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("Atom events, "); break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); } return 0; } @@ -2234,36 +2572,43 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; -static int -validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct hw_perf_event fake_event = event->hw; - - if (event->pmu && event->pmu != &pmu) - return 0; - - return x86_schedule_event(cpuc, &fake_event) >= 0; -} - +/* + * validate a single event group + * + * validation include: + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters + * + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. 
+ */ static int validate_group(struct perf_event *event) { - struct perf_event *sibling, *leader = event->group_leader; - struct cpu_hw_events fake_pmu; + struct perf_event *leader = event->group_leader; + struct cpu_hw_events fake_cpuc; + int n; - memset(&fake_pmu, 0, sizeof(fake_pmu)); + memset(&fake_cpuc, 0, sizeof(fake_cpuc)); - if (!validate_event(&fake_pmu, leader)) + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = collect_events(&fake_cpuc, leader, true); + if (n < 0) return -ENOSPC; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { - if (!validate_event(&fake_pmu, sibling)) - return -ENOSPC; - } - - if (!validate_event(&fake_pmu, event)) + fake_cpuc.n_events = n; + n = collect_events(&fake_cpuc, event, false); + if (n < 0) return -ENOSPC; - return 0; + fake_cpuc.n_events = n; + + return x86_schedule_events(&fake_cpuc, n, NULL); } const struct pmu *hw_perf_event_init(struct perf_event *event) -- cgit v1.2.2 From 8113070d6639d2245c6c79afb8df42cedab30540 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 21 Jan 2010 17:39:01 +0200 Subject: perf_events: Add fast-path to the rescheduling code Implement correct fastpath scheduling, i.e., reuse previous assignment. Signed-off-by: Stephane Eranian [ split from larger patch] Signed-off-by: Peter Zijlstra LKML-Reference: <4b588464.1818d00a.4456.383b@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 91 +++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 995ac4ae379c..0bd23d01af34 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1244,6 +1244,46 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) constraints[i]); } + /* + * fastpath, try to reuse previous register + */ + for (i = 0, num = n; i < n; i++, num--) { + hwc = &cpuc->event_list[i]->hw; + c = (unsigned long *)constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + +#if 0 + pr_debug("CPU%d fast config=0x%llx idx=%d assign=%c\n", + smp_processor_id(), + hwc->config, + hwc->idx, + assign ? 'y' : 'n'); +#endif + + set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + if (!num) + goto done; + + /* + * begin slow path + */ + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + /* * weight = number of possible counters * @@ -1263,10 +1303,9 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (x86_pmu.num_events_fixed) wmax++; - num = n; - for (w = 1; num && w <= wmax; w++) { + for (w = 1, num = n; num && w <= wmax; w++) { /* for each event */ - for (i = 0; i < n; i++) { + for (i = 0; num && i < n; i++) { c = (unsigned long *)constraints[i]; hwc = &cpuc->event_list[i]->hw; @@ -1274,28 +1313,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (weight != w) continue; - /* - * try to reuse previous assignment - * - * This is possible despite the fact that - * events or events order may have changed. - * - * What matters is the level of constraints - * of an event and this is constant for now. 
- * - * This is possible also because we always - * scan from most to least constrained. Thus, - * if a counter can be reused, it means no, - * more constrained events, needed it. And - * next events will either compete for it - * (which cannot be solved anyway) or they - * have fewer constraints, and they can use - * another counter. - */ - j = hwc->idx; - if (j != -1 && !test_bit(j, used_mask)) - goto skip; - for_each_bit(j, c, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; @@ -1303,22 +1320,23 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; -skip: - set_bit(j, used_mask); #if 0 - pr_debug("CPU%d config=0x%llx idx=%d assign=%c\n", + pr_debug("CPU%d slow config=0x%llx idx=%d assign=%c\n", smp_processor_id(), hwc->config, j, assign ? 'y' : 'n'); #endif + set_bit(j, used_mask); + if (assign) assign[i] = j; num--; } } +done: /* * scheduling failed or is just a simulation, * free resources if necessary @@ -1357,7 +1375,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, list_for_each_entry(event, &leader->sibling_list, group_entry) { if (!is_x86_event(event) || - event->state == PERF_EVENT_STATE_OFF) + event->state <= PERF_EVENT_STATE_OFF) continue; if (n >= max_count) @@ -2184,6 +2202,8 @@ static void amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, u64 *idxmsk) { + /* no constraints, means supports all generic counters */ + bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events); } static int x86_event_sched_in(struct perf_event *event, @@ -2258,7 +2278,7 @@ int hw_perf_group_sched_in(struct perf_event *leader, n1 = 1; list_for_each_entry(sub, &leader->sibling_list, group_entry) { - if (sub->state != PERF_EVENT_STATE_OFF) { + if (sub->state > PERF_EVENT_STATE_OFF) { ret = x86_event_sched_in(sub, cpuctx, cpu); if (ret) goto undo; @@ -2613,12 +2633,23 @@ static int validate_group(struct perf_event *event) const struct pmu *hw_perf_event_init(struct perf_event *event) { + const struct pmu *tmp; int err; err = __hw_perf_event_init(event); if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + if (event->group_leader != event) err = validate_group(event); + + event->pmu = tmp; } if (err) { if (event->destroy) -- cgit v1.2.2 From 502568d563bcc37ac505a83341c0c95b88c015a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 14:35:46 +0100 Subject: perf_event: x86: Allocate the fake_cpuc GCC was complaining the stack usage was too large, so allocate the structure. 
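In condensed form the fix looks like this (a sketch of the resulting function with the scheduling simulation elided; see the diff below for the full control flow):

	static int validate_group(struct perf_event *event)
	{
		struct cpu_hw_events *fake_cpuc;	/* was a stack variable */
		int ret = -ENOMEM;

		fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
		if (!fake_cpuc)
			return ret;

		/* ... collect_events() + x86_schedule_events() simulation ... */

		kfree(fake_cpuc);
		return ret;
	}
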
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100122155535.411197266@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0bd23d01af34..7bd359a57839 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2606,10 +2606,13 @@ static const struct pmu pmu = { static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; - struct cpu_hw_events fake_cpuc; - int n; + struct cpu_hw_events *fake_cpuc; + int ret, n; - memset(&fake_cpuc, 0, sizeof(fake_cpuc)); + ret = -ENOMEM; + fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); + if (!fake_cpuc) + goto out; /* * the event is not yet connected with its @@ -2617,18 +2620,24 @@ static int validate_group(struct perf_event *event) * existing siblings, then add the new event * before we can simulate the scheduling */ - n = collect_events(&fake_cpuc, leader, true); + ret = -ENOSPC; + n = collect_events(fake_cpuc, leader, true); if (n < 0) - return -ENOSPC; + goto out_free; - fake_cpuc.n_events = n; - n = collect_events(&fake_cpuc, event, false); + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); if (n < 0) - return -ENOSPC; + goto out_free; - fake_cpuc.n_events = n; + fake_cpuc->n_events = n; - return x86_schedule_events(&fake_cpuc, n, NULL); + ret = x86_schedule_events(fake_cpuc, n, NULL); + +out_free: + kfree(fake_cpuc); +out: + return ret; } const struct pmu *hw_perf_event_init(struct perf_event *event) -- cgit v1.2.2 From 81269a085669b5130058a0275aa7ba9f94abd1fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 14:55:22 +0100 Subject: perf_event: x86: Fixup constraints typing issue Constraints are defined as u64 but used in unsigned long quantities, and are then cast back and forth. 
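Condensed, the declaration change in x86_schedule_events() is (a sketch of the before/after, not the literal hunk):

	/* before: typed u64 but sized in longs, cast at every use */
	u64 constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	c = (unsigned long *)constraints[i];

	/* after: typed as what the bitmap helpers actually operate on */
	unsigned long constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	c = constraints[i];
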
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100122155535.504916780@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 7bd359a57839..7e181a5097ea 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1232,7 +1232,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) int i, j , w, num; int weight, wmax; unsigned long *c; - u64 constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct hw_perf_event *hwc; @@ -1249,7 +1249,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0, num = n; i < n; i++, num--) { hwc = &cpuc->event_list[i]->hw; - c = (unsigned long *)constraints[i]; + c = constraints[i]; /* never assigned */ if (hwc->idx == -1) @@ -1306,7 +1306,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) for (w = 1, num = n; num && w <= wmax; w++) { /* for each event */ for (i = 0; num && i < n; i++) { - c = (unsigned long *)constraints[i]; + c = constraints[i]; hwc = &cpuc->event_list[i]->hw; weight = bitmap_weight(c, X86_PMC_IDX_MAX); -- cgit v1.2.2 From c91e0f5da81c6f3a611a1bd6d0cca6717c90fdab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 15:25:59 +0100 Subject: perf_event: x86: Clean up some of the u64/long bitmask casting We need this to be u64 for direct assignment, but the bitmask functions all work on unsigned long, leading to cast heaven; solve this by using a union. 
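The resulting structure, condensed from the diff below:

	struct event_constraint {
		union {
			unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
			u64 idxmsk64[1];
		};
		int code;
		int cmask;
	};

EVENT_CONSTRAINT() can then initialize .idxmsk64[0] from a plain u64 value, while test_bit(), bitmap_copy() and friends take idxmsk directly, with no casts on either side.
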
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100122155535.595961269@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 7e181a5097ea..921bbf732e77 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -69,10 +69,11 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; -#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) - struct event_constraint { - u64 idxmsk[BITS_TO_U64(X86_PMC_IDX_MAX)]; + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64[1]; + }; int code; int cmask; }; @@ -90,13 +91,14 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ - .code = (c), \ - .cmask = (m), \ - .idxmsk[0] = (n) } +#define EVENT_CONSTRAINT(c, n, m) { \ + { .idxmsk64[0] = (n) }, \ + .code = (c), \ + .cmask = (m), \ +} #define EVENT_CONSTRAINT_END \ - { .code = 0, .cmask = 0, .idxmsk[0] = 0 } + EVENT_CONSTRAINT(0, 0, 0) #define for_each_event_constraint(e, c) \ for ((e) = (c); (e)->cmask; (e)++) @@ -126,8 +128,11 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - void (*get_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event, u64 *idxmsk); - void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); + void (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event, + unsigned long *idxmsk); + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); const struct event_constraint *event_constraints; }; @@ -2144,14 +2149,11 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } -static struct event_constraint bts_constraint = { - .code = 0, - .cmask = 0, - .idxmsk[0] = 1ULL << X86_PMC_IDX_FIXED_BTS -}; +static struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); static int intel_special_constraints(struct perf_event *event, - u64 *idxmsk) + unsigned long *idxmsk) { unsigned int hw_event; @@ -2171,14 +2173,14 @@ static int intel_special_constraints(struct perf_event *event, static void intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - u64 *idxmsk) + unsigned long *idxmsk) { const struct event_constraint *c; /* * cleanup bitmask */ - bitmap_zero((unsigned long *)idxmsk, X86_PMC_IDX_MAX); + bitmap_zero(idxmsk, X86_PMC_IDX_MAX); if (intel_special_constraints(event, idxmsk)) return; @@ -2186,10 +2188,7 @@ static void intel_get_event_constraints(struct cpu_hw_events *cpuc, if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { if ((event->hw.config & c->cmask) == c->code) { - - bitmap_copy((unsigned long *)idxmsk, - (unsigned long *)c->idxmsk, - X86_PMC_IDX_MAX); + bitmap_copy(idxmsk, c->idxmsk, X86_PMC_IDX_MAX); return; } } @@ -2200,10 +2199,10 @@ static void intel_get_event_constraints(struct cpu_hw_events *cpuc, static void amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - u64 *idxmsk) + unsigned long *idxmsk) { /* no constraints, means supports all generic counters */ - bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events); + bitmap_fill(idxmsk, x86_pmu.num_events); } static int 
x86_event_sched_in(struct perf_event *event, -- cgit v1.2.2 From 8433be1184e4f22c37d4b8ed36cde529a47882f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 15:38:26 +0100 Subject: perf_event: x86: Reduce some overly long lines with some MACROs Introduce INTEL_EVENT_CONSTRAINT and FIXED_EVENT_CONSTRAINT to reduce some line length and typing work. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100122155535.688730371@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 68 ++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 921bbf732e77..4d1ed101c10d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -97,6 +97,12 @@ struct cpu_hw_events { .cmask = (m), \ } +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) + #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) @@ -192,12 +198,12 @@ static u64 p6_pmu_raw_event(u64 hw_event) static struct event_constraint intel_p6_event_constraints[] = { - EVENT_CONSTRAINT(0xc1, 0x1, INTEL_ARCH_EVENT_MASK), /* FLOPS */ - EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ EVENT_CONSTRAINT_END }; @@ -217,41 +223,41 @@ static const u64 intel_perfmon_event_map[] = static struct event_constraint intel_core_event_constraints[] = { - EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ - EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ - EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x2, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT(0x18, 0x1, INTEL_ARCH_EVENT_MASK), /* IDLE_DURING_DIV */ - EVENT_CONSTRAINT(0x19, 0x2, INTEL_ARCH_EVENT_MASK), /* DELAYED_BYPASS */ - EVENT_CONSTRAINT(0xa1, 0x1, INTEL_ARCH_EVENT_MASK), /* RS_UOPS_DISPATCH_CYCLES */ - EVENT_CONSTRAINT(0xcb, 0x1, INTEL_ARCH_EVENT_MASK), /* MEM_LOAD_RETIRED */ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + 
INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_nehalem_event_constraints[] = { - EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ - EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ - EVENT_CONSTRAINT(0x40, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LD */ - EVENT_CONSTRAINT(0x41, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_ST */ - EVENT_CONSTRAINT(0x42, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK */ - EVENT_CONSTRAINT(0x43, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_ALL_REF */ - EVENT_CONSTRAINT(0x4e, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_PREFETCH */ - EVENT_CONSTRAINT(0x4c, 0x3, INTEL_ARCH_EVENT_MASK), /* LOAD_HIT_PRE */ - EVENT_CONSTRAINT(0x51, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D */ - EVENT_CONSTRAINT(0x52, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0x53, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0xc5, 0x3, INTEL_ARCH_EVENT_MASK), /* CACHE_LOCK_CYCLES */ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + INTEL_EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ + INTEL_EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_gen_event_constraints[] = { - EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ - EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ EVENT_CONSTRAINT_END }; -- cgit v1.2.2 From 63b146490befc027a7e0923e333269e68b20d380 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 16:32:17 +0100 Subject: perf_event: x86: Optimize the constraint searching bits Instead of copying bitmasks around, pass pointers to the constraint structure. 
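Condensed, the scheduling loop goes from copying each constraint bitmap to borrowing a pointer (a sketch, not the literal hunk):

	/* before: one bitmap copy per event */
	unsigned long constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i], constraints[i]);

	/* after: point at the constraint object the PMU code already owns */
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	constraints[i] = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);

The "no constraint" case is handled by returning a pointer to a static catch-all object, unconstrained, instead of filling a caller-supplied bitmap.
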
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100122155535.887853503@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 75 ++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4d1ed101c10d..092ad566734c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -134,12 +134,14 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - void (*get_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event, - unsigned long *idxmsk); + + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - const struct event_constraint *event_constraints; + struct event_constraint *event_constraints; }; static struct x86_pmu x86_pmu __read_mostly; @@ -1242,17 +1244,15 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { int i, j , w, num; int weight, wmax; - unsigned long *c; - unsigned long constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); for (i = 0; i < n; i++) { - x86_pmu.get_event_constraints(cpuc, - cpuc->event_list[i], - constraints[i]); + constraints[i] = + x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); } /* @@ -1267,7 +1267,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) break; /* constraint still honored */ - if (!test_bit(hwc->idx, c)) + if (!test_bit(hwc->idx, c->idxmsk)) break; /* not already used */ @@ -1320,11 +1320,11 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) c = constraints[i]; hwc = &cpuc->event_list[i]->hw; - weight = bitmap_weight(c, X86_PMC_IDX_MAX); + weight = bitmap_weight(c->idxmsk, X86_PMC_IDX_MAX); if (weight != w) continue; - for_each_bit(j, c, X86_PMC_IDX_MAX) { + for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; } @@ -2155,11 +2155,13 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } +static struct event_constraint unconstrained; + static struct event_constraint bts_constraint = EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); -static int intel_special_constraints(struct perf_event *event, - unsigned long *idxmsk) +static struct event_constraint * +intel_special_constraints(struct perf_event *event) { unsigned int hw_event; @@ -2169,46 +2171,34 @@ static int intel_special_constraints(struct perf_event *event, x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && (event->hw.sample_period == 1))) { - bitmap_copy((unsigned long *)idxmsk, - (unsigned long *)bts_constraint.idxmsk, - X86_PMC_IDX_MAX); - return 1; + return &bts_constraint; } - return 0; + return NULL; } -static void intel_get_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event, - unsigned long *idxmsk) +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - const struct event_constraint *c; + struct event_constraint *c; - /* - * cleanup bitmask - */ - bitmap_zero(idxmsk, X86_PMC_IDX_MAX); - - if (intel_special_constraints(event, idxmsk)) - 
return; + c = intel_special_constraints(event); + if (c) + return c; if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { - bitmap_copy(idxmsk, c->idxmsk, X86_PMC_IDX_MAX); - return; - } + if ((event->hw.config & c->cmask) == c->code) + return c; } } - /* no constraints, means supports all generic counters */ - bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events); + + return &unconstrained; } -static void amd_get_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event, - unsigned long *idxmsk) +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - /* no constraints, means supports all generic counters */ - bitmap_fill(idxmsk, x86_pmu.num_events); + return &unconstrained; } static int x86_event_sched_in(struct perf_event *event, @@ -2576,6 +2566,9 @@ void __init init_hw_perf_events(void) perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); + unconstrained = (struct event_constraint) + EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0); + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); -- cgit v1.2.2 From 272d30be622c9c6cbd514b1211ff359292001baa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 16:32:17 +0100 Subject: perf_event: x86: Optimize constraint weight computation Add a weight member to the constraint structure and avoid recomputing the weight at runtime. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100122155535.963944926@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 092ad566734c..2c22ce4fa784 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -76,6 +77,7 @@ struct event_constraint { }; int code; int cmask; + int weight; }; struct cpu_hw_events { @@ -95,6 +97,7 @@ struct cpu_hw_events { { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ + .weight = HWEIGHT64((u64)(n)), \ } #define INTEL_EVENT_CONSTRAINT(c, n) \ @@ -1242,8 +1245,7 @@ static inline int is_x86_event(struct perf_event *event) static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - int i, j , w, num; - int weight, wmax; + int i, j, w, num, wmax; struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct hw_perf_event *hwc; @@ -1320,8 +1322,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) c = constraints[i]; hwc = &cpuc->event_list[i]->hw; - weight = bitmap_weight(c->idxmsk, X86_PMC_IDX_MAX); - if (weight != w) + if (c->weight != w) continue; for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { -- cgit v1.2.2 From c933c1a603d5bf700ddce79216c1be0ec3bc0e6c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 16:40:12 +0100 Subject: perf_event: x86: Optimize the fast path a little more Remove num from the fast path and save a few ops. 
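Condensed, the fast path loses its separate book-keeping counter (sketch):

	/* before: count down as events keep their old counters */
	for (i = 0, num = n; i < n; i++, num--) {
		/* try to reuse hwc->idx, break out on any conflict */
	}
	if (!num)
		goto done;

	/* after: the loop index already says whether every event was placed */
	for (i = 0; i < n; i++) {
		/* try to reuse hwc->idx, break out on any conflict */
	}
	if (i == n)
		goto done;
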
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100122155536.056430539@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2c22ce4fa784..33c889ff21ae 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1245,9 +1245,9 @@ static inline int is_x86_event(struct perf_event *event) static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - int i, j, w, num, wmax; struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int i, j, w, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); @@ -1260,7 +1260,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* * fastpath, try to reuse previous register */ - for (i = 0, num = n; i < n; i++, num--) { + for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; c = constraints[i]; @@ -1288,7 +1288,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (assign) assign[i] = hwc->idx; } - if (!num) + if (i == n) goto done; /* -- cgit v1.2.2 From 6c9687abeb24d5b7aae7db5be070c2139ad29e29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jan 2010 11:57:25 +0100 Subject: perf_event: x86: Optimize x86_pmu_disable() x86_pmu_disable() removes the event from the cpuc->event_list[], however since an event can only be on that list once, stop looking after we found it. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 33c889ff21ae..66de282ad2fb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1884,6 +1884,7 @@ static void x86_pmu_disable(struct perf_event *event) cpuc->event_list[i-1] = cpuc->event_list[i]; --cpuc->n_events; + break; } } perf_event_update_userpage(event); -- cgit v1.2.2 From 184f412c3341cd24fbd26604634a5800b83dbdc3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Jan 2010 08:39:39 +0100 Subject: perf, x86: Clean up event constraints code a bit - Remove stray debug code - Improve ugly macros a bit - Remove some whitespace damage - (Also fix up some accumulated damage in perf_event.h) Signed-off-by: Ingo Molnar Cc: Stephane Eranian Cc: Peter Zijlstra LKML-Reference: --- arch/x86/kernel/cpu/perf_event.c | 37 ++++++++----------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66de282ad2fb..fdbe24842271 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,24 +93,19 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ +#define EVENT_CONSTRAINT(c, n, m) { \ { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = HWEIGHT64((u64)(n)), \ } -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define INTEL_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define FIXED_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) 
-#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) +#define EVENT_CONSTRAINT_END EVENT_CONSTRAINT(0, 0, 0) -#define EVENT_CONSTRAINT_END \ - EVENT_CONSTRAINT(0, 0, 0) - -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->cmask; (e)++) +#define for_each_event_constraint(e, c) for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -1276,14 +1271,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (test_bit(hwc->idx, used_mask)) break; -#if 0 - pr_debug("CPU%d fast config=0x%llx idx=%d assign=%c\n", - smp_processor_id(), - hwc->config, - hwc->idx, - assign ? 'y' : 'n'); -#endif - set_bit(hwc->idx, used_mask); if (assign) assign[i] = hwc->idx; @@ -1333,14 +1320,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; -#if 0 - pr_debug("CPU%d slow config=0x%llx idx=%d assign=%c\n", - smp_processor_id(), - hwc->config, - j, - assign ? 'y' : 'n'); -#endif - set_bit(j, used_mask); if (assign) @@ -2596,9 +2575,9 @@ static const struct pmu pmu = { * validate a single event group * * validation include: - * - check events are compatible which each other - * - events do not compete for the same counter - * - number of events <= number of counters + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. -- cgit v1.2.2 From 2e8418736dff9c6fdadb2f87dcc2087cebf32167 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jan 2010 15:58:43 +0100 Subject: perf_event: x86: Deduplicate the disable code Share the meat of the x86_pmu_disable() code with hw_perf_enable(). Also remove the barrier() from that code, since I could not convince myself we actually need it. 
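The shared helper, condensed from the diff below (BTS buffer draining elided):

	static void __x86_pmu_disable(struct perf_event *event,
				      struct cpu_hw_events *cpuc)
	{
		struct hw_perf_event *hwc = &event->hw;
		int idx = hwc->idx;

		/* must be done before we disable, for the nmi handler */
		clear_bit(idx, cpuc->active_mask);
		x86_pmu.disable(hwc, idx);

		/* drain the remaining delta count out of the event */
		x86_perf_event_update(event, hwc, idx);

		cpuc->events[idx] = NULL;
	}

x86_pmu_disable() then calls this before pruning event_list[], and hw_perf_enable() uses it when moving an event to a new counter.
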
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fdbe24842271..07fa0c2faa09 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1401,6 +1401,8 @@ static inline void x86_assign_hw_event(struct perf_event *event, } } +static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc); + void hw_perf_enable(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1426,13 +1428,7 @@ void hw_perf_enable(void) if (hwc->idx == -1 || hwc->idx == cpuc->assign[i]) continue; - x86_pmu.disable(hwc, hwc->idx); - - clear_bit(hwc->idx, cpuc->active_mask); - barrier(); - cpuc->events[hwc->idx] = NULL; - - x86_perf_event_update(event, hwc, hwc->idx); + __x86_pmu_disable(event, cpuc); hwc->idx = -1; } @@ -1822,11 +1818,10 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) event->pending_kill = POLL_IN; } -static void x86_pmu_disable(struct perf_event *event) +static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int i, idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler @@ -1835,12 +1830,6 @@ static void x86_pmu_disable(struct perf_event *event) clear_bit(idx, cpuc->active_mask); x86_pmu.disable(hwc, idx); - /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the event: - */ - barrier(); - /* * Drain the remaining delta count out of a event * that we are disabling: @@ -1852,6 +1841,14 @@ static void x86_pmu_disable(struct perf_event *event) intel_pmu_drain_bts_buffer(cpuc); cpuc->events[idx] = NULL; +} + +static void x86_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; + + __x86_pmu_disable(event, cpuc); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { -- cgit v1.2.2 From ed8777fc132e589d48a0ba854fdbb5d8203b58e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Jan 2010 23:07:46 +0100 Subject: perf_events, x86: Fix event constraint masks Since constraints are specified on the event number only, not on the event number plus unit mask, shorten the constraint masks so that we'll actually match something. 
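The constraint lookup is (event->hw.config & c->cmask) == c->code, and the table codes are bare event numbers such as 0xc0; masking config with the event-number-plus-unit-mask bits means any event programmed with a non-zero unit mask can never compare equal. Condensed, the fix is:

	#define INTEL_EVENT_CONSTRAINT(c, n) \
		EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)	/* was INTEL_ARCH_EVENT_MASK */

INTEL_ARCH_FIXED_MASK is rebuilt the same way, from INTEL_ARCH_EVTSEL_MASK instead of INTEL_ARCH_EVENT_MASK.
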
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100127221121.967610372@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 +- arch/x86/kernel/cpu/perf_event.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index dbc082685d52..ff5ede128bae 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -49,7 +49,7 @@ INTEL_ARCH_INV_MASK| \ INTEL_ARCH_EDGE_MASK|\ INTEL_ARCH_UNIT_MASK|\ - INTEL_ARCH_EVENT_MASK) + INTEL_ARCH_EVTSEL_MASK) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 07fa0c2faa09..951213a51489 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -100,12 +100,17 @@ struct cpu_hw_events { .weight = HWEIGHT64((u64)(n)), \ } -#define INTEL_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) -#define FIXED_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) -#define EVENT_CONSTRAINT_END EVENT_CONSTRAINT(0, 0, 0) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) -#define for_each_event_constraint(e, c) for ((e) = (c); (e)->cmask; (e)++) +#define EVENT_CONSTRAINT_END \ + EVENT_CONSTRAINT(0, 0, 0) + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu -- cgit v1.2.2 From 1a6e21f791fe85b40a9ddbafe999ab8ccffc3f78 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Jan 2010 23:07:47 +0100 Subject: perf_events, x86: Clean up hw_perf_*_all() implementation Put the recursion avoidance code in the generic hook instead of replicating it in each implementation. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100127221122.057507285@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 59 ++++++++++------------------------------ 1 file changed, 14 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 951213a51489..cf10839f20ea 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1099,15 +1099,8 @@ static int __hw_perf_event_init(struct perf_event *event) static void p6_pmu_disable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -1118,12 +1111,6 @@ static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) @@ -1135,17 +1122,6 @@ static void amd_pmu_disable_all(void) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - /* - * ensure we write the disable before we start disabling the - * events proper, so that amd_pmu_enable_event() does the - * right thing. 
- */ - barrier(); - for (idx = 0; idx < x86_pmu.num_events; idx++) { u64 val; @@ -1166,23 +1142,20 @@ void hw_perf_disable(void) if (!x86_pmu_initialized()) return; - if (cpuc->enabled) - cpuc->n_added = 0; + if (!cpuc->enabled) + return; + + cpuc->n_added = 0; + cpuc->enabled = 0; + barrier(); x86_pmu.disable_all(); } static void p6_pmu_enable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long val; - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -1193,12 +1166,6 @@ static void intel_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { @@ -1217,12 +1184,6 @@ static void amd_pmu_enable_all(void) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - for (idx = 0; idx < x86_pmu.num_events; idx++) { struct perf_event *event = cpuc->events[idx]; u64 val; @@ -1417,6 +1378,10 @@ void hw_perf_enable(void) if (!x86_pmu_initialized()) return; + + if (cpuc->enabled) + return; + if (cpuc->n_added) { /* * apply assignment obtained either from @@ -1461,6 +1426,10 @@ void hw_perf_enable(void) cpuc->n_added = 0; perf_events_lapic_init(); } + + cpuc->enabled = 1; + barrier(); + x86_pmu.enable_all(); } -- cgit v1.2.2 From 452a339a976e7f782c786eb3f73080401e2fa3a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Jan 2010 23:07:48 +0100 Subject: perf_events, x86: Implement Intel Westmere support The new Intel documentation includes Westmere arch specific event maps that are significantly different from the Nehalem ones. Add support for this generation. Found the CPUID model numbers on wikipedia. Also ammend some Nehalem constraints, spotted those when looking for the differences between Nehalem and Westmere. Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Cc: "H. 
Peter Anvin" Cc: Stephane Eranian LKML-Reference: <20100127221122.151865645@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 124 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 117 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cf10839f20ea..3fac0bfc2dee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -244,18 +244,26 @@ static struct event_constraint intel_core_event_constraints[] = static struct event_constraint intel_nehalem_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - INTEL_EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - INTEL_EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ - INTEL_EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ EVENT_CONSTRAINT_END }; @@ -286,6 +294,97 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +static __initconst u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] 
= { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static __initconst u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -2423,7 +2522,9 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_core_event_constraints; pr_cont("Core2 events, "); break; - case 26: + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -2437,6 +2538,15 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("Atom events, "); break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + pr_cont("Westmere events, "); + break; default: /* * default constraints for v2 and up -- cgit v1.2.2 From 18c01f8abff51e4910cc5ffb4b710e8c6eea60c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Jan 2010 23:07:49 +0100 Subject: perf_events, x86: Remove spurious counter reset from x86_pmu_enable() At enable time the counter might still have a ->idx pointing to a previously occupied location that might now be taken by another event. Resetting the counter at that location with data from this event will destroy the other counter's count. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <20100127221122.261477183@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3fac0bfc2dee..518eb3e39577 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1762,9 +1762,6 @@ static int x86_pmu_enable(struct perf_event *event) cpuc->n_events = n; cpuc->n_added = n - n0; - if (hwc->idx != -1) - x86_perf_event_set_period(event, hwc, hwc->idx); - return 0; } -- cgit v1.2.2 From 221af7f87b97431e3ee21ce4b0e77d5411cf1549 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jan 2010 22:14:42 -0800 Subject: Split 'flush_old_exec' into two functions 'flush_old_exec()' is the point of no return when doing an execve(), and it is pretty badly misnamed. It doesn't just flush the old executable environment, it also starts up the new one. 
Which is very inconvenient for things like setting up the new personality, because we want the new personality to affect the starting of the new environment, but at the same time we do _not_ want the new personality to take effect if flushing the old one fails. As a result, the x86-64 '32-bit' personality is actually done using this insane "I'm going to change the ABI, but I haven't done it yet" bit (TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the personality, but just the "pending" bit, so that "flush_thread()" can do the actual personality magic. This patch in no way changes any of that insanity, but it does split the 'flush_old_exec()' function up into a preparatory part that can fail (still called flush_old_exec()), and a new part that will actually set up the new exec environment (setup_new_exec()). All callers are changed to trivially comply with the new world order. Signed-off-by: H. Peter Anvin Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 2a4d073d2cf1..435d2a5323da 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -308,15 +308,17 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) return retval; - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); clear_thread_flag(TIF_ABI_PENDING); + setup_new_exec(bprm); + + regs->cs = __USER32_CS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = + regs->r13 = regs->r14 = regs->r15 = 0; + current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); current->mm->end_data = ex.a_data + -- cgit v1.2.2 From 05d43ed8a89c159ff641d472f970e3f1baa66318 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 28 Jan 2010 22:14:43 -0800 Subject: x86: get rid of the insane TIF_ABI_PENDING bit Now that the previous commit made it possible to do the personality setting at the point of no return, we do just that for ELF binaries. And suddenly all the reasons for that insane TIF_ABI_PENDING bit go away, and we can just make SET_PERSONALITY() just do the obvious thing for a 32-bit compat process. Everything becomes much more straightforward this way. Signed-off-by: H. 
Peter Anvin Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 1 - arch/x86/include/asm/elf.h | 10 ++-------- arch/x86/include/asm/thread_info.h | 2 -- arch/x86/kernel/process.c | 12 ------------ arch/x86/kernel/process_64.c | 11 +++++++++++ 5 files changed, 13 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 435d2a5323da..f9f472462753 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -311,7 +311,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_ABI_PENDING); setup_new_exec(bprm); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index b4501ee223ad..1994d3f58443 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -181,14 +181,8 @@ do { \ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); #define compat_start_thread start_thread_ia32 -#define COMPAT_SET_PERSONALITY(ex) \ -do { \ - if (test_thread_flag(TIF_IA32)) \ - clear_thread_flag(TIF_ABI_PENDING); \ - else \ - set_thread_flag(TIF_ABI_PENDING); \ - current->personality |= force_personality32; \ -} while (0) +void set_personality_ia32(void); +#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() #define COMPAT_ELF_PLATFORM ("i686") diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 375c917c37d2..e0d28901e969 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,7 +87,6 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ -#define TIF_ABI_PENDING 19 #define TIF_MEMDIE 20 #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -112,7 +111,6 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) -#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FREEZE (1 << TIF_FREEZE) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 02c3ee013ccd..c9b3522b6b46 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -115,18 +115,6 @@ void flush_thread(void) { struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } -#endif - flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f9e033150cdf..41a26a82470a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -521,6 +521,17 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + unsigned long get_wchan(struct task_struct *p) { 
unsigned long stack; -- cgit v1.2.2 From 69c89efb51510b3dc0fa336f7fa257c6e1799ee4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 29 Jan 2010 11:42:20 -0800 Subject: x86, irq: Update the vector domain for legacy irqs handled by io-apic In the recent change of not reserving IRQ0_VECTOR..IRQ15_VECTOR's on all cpu's, we start with irq 0..15 getting directed to (and handled on) cpu-0. In the logical flat mode, once the AP's are online (and before irqbalance comes into picture), kernel intends to handle these IRQ's on any cpu (as the logical flat mode allows to specify multiple cpu's for the irq destination and the chipset based routing can deliver to the interrupt to any one of the specified cpu's). This was broken with our recent change, which was ending up using only cpu 0 as the destination, even when the kernel was specifying to use all online cpu's for the logical flat mode case. Fix this by updating vector allocation domain (cfg->domain) for legacy irqs, when the IO-APIC handles them. Signed-off-by: Suresh Siddha LKML-Reference: <20100129194330.207790269@sbs-t61.sc.intel.com> Tested-by: Li Zefan Cc: Yinghai Lu Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1a30587a6bc2..2430b31c9857 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1428,6 +1428,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq cfg = desc->chip_data; + /* + * For legacy irqs, cfg->domain starts with cpu 0 for legacy + * controllers like 8259. Now that IO-APIC can handle this irq, update + * the cfg->domain. + */ + if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + apic->vector_allocation_domain(0, cfg->domain); + if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.2.2 From 9d133e5db993d577bd868b54083869fe5479fcff Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 29 Jan 2010 11:42:21 -0800 Subject: x86, irq: Move __setup_vector_irq() before the first irq enable in cpu online path Lowest priority delivery of logical flat mode is broken on some systems, such that even when IO-APIC RTE says deliver the interrupt to a particular CPU, interrupt subsystem delivers the interrupt to totally different CPU. For example, this behavior was observed on a P4 based system with SiS chipset which was reported by Li Zefan. We have been handling this kind of behavior by making sure that in logical flat mode, we assign the same vector to irq mappings on all the 8 possible logical cpu's. But we have been doing this initial assignment (__setup_vector_irq()) a little late (before which interrupts were already enabled for a short duration). Move the __setup_vector_irq() before the first irq enable point in the cpu online path to avoid the issue of not handling some interrupts that wrongly hit the cpu which is still coming online. Signed-off-by: Suresh Siddha LKML-Reference: <20100129194330.283696385@sbs-t61.sc.intel.com> Tested-by: Li Zefan Cc: Yinghai Lu Cc: Eric W. Biederman Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 8 +++++++- arch/x86/kernel/smpboot.c | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2430b31c9857..937150e4c06d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1256,11 +1256,16 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; struct irq_desc *desc; + /* + * vector_lock will make sure that we don't run into irq vector + * assignments that might be happening on another cpu in parallel, + * while we setup our initial vector to irq mappings. + */ + spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; @@ -1279,6 +1284,7 @@ void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } + spin_unlock(&vector_lock); } static struct irq_chip ioapic_chip; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..b2ebcba729d9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -241,6 +241,11 @@ static void __cpuinit smp_callin(void) map_cpu_to_logical_apicid(); notify_cpu_starting(cpuid); + + /* + * Need to setup vector mappings before we enable interrupts. + */ + __setup_vector_irq(smp_processor_id()); /* * Get our bogomips. * @@ -315,7 +320,6 @@ notrace static void __cpuinit start_secondary(void *unused) */ ipi_call_lock(); lock_vector_lock(); - __setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); ipi_call_unlock(); -- cgit v1.2.2 From 7c099ce1575126395f186ecf58b51a60d5c3be7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Thu, 28 Jan 2010 21:02:54 +0100 Subject: x86: Add quirk for Intel DG45FC board to avoid low memory corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 6aa542a694dc9ea4344a8a590d2628c33d1b9431 added a quirk for the Intel DG45ID board due to low memory corruption. The Intel DG45FC shares the same BIOS (and the same bug) as noted in: http://bugzilla.kernel.org/show_bug.cgi?id=13736 Signed-off-by: David Härdeman LKML-Reference: <20100128200254.GA9134@hardeman.nu> Cc: Cc: Alexey Fisher Cc: ykzhao Cc: Tony Bones Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7b8b9894b22..5d9e40c58628 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -642,19 +642,27 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. 
*/ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, #endif {} }; -- cgit v1.2.2 From cc0967490c1c3824bc5b75718b6ca8a51d9f2617 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:42 -0600 Subject: x86, hw_breakpoints, kgdb: Fix kgdb to use hw_breakpoint API In the 2.6.33 kernel, the hw_breakpoint API is now used for the performance event counters. The hw_breakpoint_handler() now consumes the hw breakpoints that were previously set by kgdb arch specific code. In order for kgdb to work in conjunction with this core API change, kgdb must use some of the low level functions of the hw_breakpoint API to install, uninstall, and deal with hw breakpoint reservations. The kgdb core required a change to call kgdb_disable_hw_debug anytime a slave cpu enters kgdb_wait() in order to keep all the hw breakpoints in sync as well as to prevent hitting a hw breakpoint while kgdb is active. During the architecture specific initialization of kgdb, it will pre-allocate 4 disabled (struct perf event **) structures. Kgdb will use these to manage the capabilities for the 4 hw breakpoint registers, per cpu. Right now the hw_breakpoint API does not have a way to ask how many breakpoints are available, on each CPU so it is possible that the install of a breakpoint might fail when kgdb restores the system to the run state. The intent of this patch is to first get the basic functionality of hw breakpoints working and leave it to the person debugging the kernel to understand what hw breakpoints are in use and what restrictions have been imposed as a result. Breakpoint constraints will be dealt with in a future patch. While atomic, the x86 specific kgdb code will call arch_uninstall_hw_breakpoint() and arch_install_hw_breakpoint() to manage the cpu specific hw breakpoints. The net result of these changes allow kgdb to use the same pool of hw_breakpoints that are used by the perf event API, but neither knows about future reservations for the available hw breakpoint slots. 
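For reference, the reservation pattern reduces to the following condensed sketch (error handling trimmed, placeholder address; the real sequence is in the diff below). A disabled breakpoint is registered on every cpu from non-atomic context, and only armed later, while atomic, through the low-level install call:

	static struct perf_event **example_reserve(unsigned long addr)
	{
		struct perf_event_attr attr = {
			.type     = PERF_TYPE_BREAKPOINT,
			.bp_addr  = addr,		/* placeholder address */
			.bp_len   = HW_BREAKPOINT_LEN_1,
			.bp_type  = HW_BREAKPOINT_W,
			.disabled = 1,			/* reserve the slot, do not arm it */
		};

		/* Returns a per-cpu array of events; check with IS_ERR(). */
		return register_wide_hw_breakpoint(&attr, NULL);
	}

	/* Later, from atomic (kgdb) context, arm the slot on this cpu: */
	static void example_arm(struct perf_event **pev)
	{
		struct perf_event *bp = *per_cpu_ptr(pev, raw_smp_processor_id());

		if (!arch_install_hw_breakpoint(bp))
			bp->attr.disabled = 0;
	}
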
Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 171 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 114 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index dd74fe7273b1..62bea7307eaa 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -204,40 +205,38 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) static struct hw_breakpoint { unsigned enabled; - unsigned type; - unsigned len; unsigned long addr; + int len; + int type; + struct perf_event **pev; } breakinfo[4]; static void kgdb_correct_hw_break(void) { - unsigned long dr7; - int correctit = 0; - int breakbit; int breakno; - get_debugreg(dr7, 7); for (breakno = 0; breakno < 4; breakno++) { - breakbit = 2 << (breakno << 1); - if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { - correctit = 1; - dr7 |= breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - dr7 |= ((breakinfo[breakno].len << 2) | - breakinfo[breakno].type) << - ((breakno << 2) + 16); - set_debugreg(breakinfo[breakno].addr, breakno); - - } else { - if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { - correctit = 1; - dr7 &= ~breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - } - } + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; } - if (correctit) - set_debugreg(dr7, 7); + hw_breakpoint_restore(); } static int @@ -259,15 +258,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) static void kgdb_remove_all_hw_break(void) { int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; - for (i = 0; i < 4; i++) - memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } static int kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { - unsigned type; int i; for (i = 0; i < 4; i++) @@ -278,27 +285,38 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) switch (bptype) { case BP_HARDWARE_BREAKPOINT: - type = 0; - len = 1; + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; break; case BP_WRITE_WATCHPOINT: - type = 1; + breakinfo[i].type = X86_BREAKPOINT_WRITE; break; case BP_ACCESS_WATCHPOINT: - type = 3; + breakinfo[i].type = X86_BREAKPOINT_RW; break; default: return -1; } - - if (len == 1 || len == 2 || len == 4) - breakinfo[i].len = len - 1; - else + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = 
X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: return -1; - - breakinfo[i].enabled = 1; + } breakinfo[i].addr = addr; - breakinfo[i].type = type; + breakinfo[i].enabled = 1; return 0; } @@ -313,8 +331,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) */ void kgdb_disable_hw_debug(struct pt_regs *regs) { + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } /** @@ -378,7 +409,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, struct pt_regs *linux_regs) { unsigned long addr; - unsigned long dr6; char *ptr; int newPC; @@ -404,20 +434,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, raw_smp_processor_id()); } - get_debugreg(dr6, 6); - if (!(dr6 & 0x4000)) { - int breakno; - - for (breakno = 0; breakno < 4; breakno++) { - if (dr6 & (1 << breakno) && - breakinfo[breakno].type == 0) { - /* Set restore flag: */ - linux_regs->flags |= X86_EFLAGS_RF; - break; - } - } - } - set_debugreg(0UL, 6); kgdb_correct_hw_break(); return 0; @@ -485,8 +501,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) break; case DIE_DEBUG: - if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id()) { + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) return single_step_cont(regs, args); break; @@ -539,7 +554,42 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - return register_die_notifier(&kgdb_notifier); + int i, cpu; + int ret; + struct perf_event_attr attr; + struct perf_event **pevent; + + ret = register_die_notifier(&kgdb_notifier); + if (ret != 0) + return ret; + /* + * Pre-allocate the hw breakpoint structions in the non-atomic + * portion of kgdb because this operation requires mutexs to + * complete. + */ + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.type = PERF_TYPE_BREAKPOINT; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + for (i = 0; i < 4; i++) { + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + if (IS_ERR(breakinfo[i].pev)) { + printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + breakinfo[i].pev = NULL; + kgdb_arch_exit(); + return -1; + } + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + pevent[0]->hw.sample_period = 1; + if (pevent[0]->destroy != NULL) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } + return ret; } /** @@ -550,6 +600,13 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { + int i; + for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } unregister_die_notifier(&kgdb_notifier); } -- cgit v1.2.2 From 5352ae638e2d7d5c9b2e4d528676bbf2af6fd6f3 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:43 -0600 Subject: perf, hw_breakpoint, kgdb: Do not take mutex for kernel debugger This patch fixes the regression in functionality where the kernel debugger and the perf API do not nicely share hw breakpoint reservations. 
The kernel debugger cannot use any mutex_lock() calls because it can start the kernel running from an invalid context. A mutex free version of the reservation API needed to get created for the kernel debugger to safely update hw breakpoint reservations. The possibility for a breakpoint reservation to be concurrently processed at the time that kgdb interrupts the system is improbable. Should this corner case occur the end user is warned, and the kernel debugger will prohibit updating the hardware breakpoint reservations. Any time the kernel debugger reserves a hardware breakpoint it will be a system wide reservation. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-3-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 62bea7307eaa..bfba6019d762 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -239,6 +239,49 @@ static void kgdb_correct_hw_break(void) hw_breakpoint_restore(); } +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responisble for handing the retry on + * remove failure. + */ + return -1; + } + return 0; +} + static int kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { @@ -250,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) if (i == 4) return -1; + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } breakinfo[i].enabled = 0; return 0; @@ -316,6 +363,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) return -1; } breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } breakinfo[i].enabled = 1; return 0; -- cgit v1.2.2 From 3b9cfc0a99f88c0db7c72363620584a9b40b4543 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Sun, 31 Jan 2010 20:16:34 +0100 Subject: x86, mtrr: Constify struct mtrr_ops This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy LKML-Reference: <4B65D712.3080804@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/amd.c | 2 +- arch/x86/kernel/cpu/mtrr/centaur.c | 2 +- arch/x86/kernel/cpu/mtrr/cyrix.c | 2 +- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- arch/x86/kernel/cpu/mtrr/mtrr.h | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 33af14110dfd..92ba9cd31c9a 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) return 0; } -static struct mtrr_ops amd_mtrr_ops = { +static const struct mtrr_ops amd_mtrr_ops = { .vendor = X86_VENDOR_AMD, .set = amd_set_mtrr, .get = amd_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index de89f14eff3a..316fe3e60a97 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t return 0; } -static struct mtrr_ops centaur_mtrr_ops = { +static const struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, .set = centaur_set_mcr, .get = centaur_get_mcr, diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 228d982ce09c..68a3343e5798 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -265,7 +265,7 @@ static void cyrix_set_all(void) post_set(); } -static struct mtrr_ops cyrix_mtrr_ops = { +static const struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, .set_all = cyrix_set_all, .set = cyrix_set_arr, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 55da0c5f68dd..4d755846fee6 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -752,7 +752,7 @@ int positive_have_wrcomb(void) /* * Generic structure... 
*/ -struct mtrr_ops generic_mtrr_ops = { +const struct mtrr_ops generic_mtrr_ops = { .use_intel_if = 1, .set_all = generic_set_all, .get = generic_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 84e83de54575..fe4622e8c837 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops *mtrr_if; +const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops *ops) +void set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index a501dee9a87a..df5e41f31a27 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size, extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); -extern struct mtrr_ops generic_mtrr_ops; +extern const struct mtrr_ops generic_mtrr_ops; extern int positive_have_wrcomb(void); @@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops *ops); +extern void set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops *mtrr_if; +extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) -- cgit v1.2.2 From 1b5576e69a5fe168c08a159685ac366316ac9bbc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 22 Jan 2010 11:21:04 +0800 Subject: x86: Remove BIOS data range from e820 In preparation for moving to the generic page_is_ram(), make explicit what we expect to be reserved and not reserved. Tested-by: Wu Fengguang Signed-off-by: Yinghai Lu LKML-Reference: <20100122033004.335813103@intel.com> Cc: Andrew Morton Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/e820.c | 8 ++++++++ arch/x86/kernel/setup.c | 19 ++++++++++++++++++- arch/x86/mm/ioremap.c | 16 ---------------- 3 files changed, 26 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d17d482a04f4..230687ba5ba5 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -517,11 +517,19 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype) { int i; + u64 end; u64 real_removed_size = 0; if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cdb6a8a506dd..f9b1f4e5ab74 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -650,6 +650,23 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { {} }; +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + /* + * special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + * take them out. + */ + e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -813,7 +830,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - + trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 334e63ca7b2b..30e068d6462e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -29,22 +29,6 @@ int page_is_ram(unsigned long pagenr) resource_size_t addr, end; int i; - /* - * A special case is the first 4Kb of memory; - * This is a BIOS owned area, not kernel ram, but generally - * not listed as such in the E820 table. - */ - if (pagenr == 0) - return 0; - - /* - * Second special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. - */ - if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && - pagenr < (BIOS_END >> PAGE_SHIFT)) - return 0; - for (i = 0; i < e820.nr_map; i++) { /* * Not usable memory: -- cgit v1.2.2 From 13ca0fcaa33f6b1984c4111b6ec5df42689fea6f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 22 Jan 2010 11:21:05 +0800 Subject: x86: Use the generic page_is_ram() The generic resource based page_is_ram() works better with memory hotplug/hotremove. So switch the x86 e820map based code to it. CC: Andi Kleen CC: KAMEZAWA Hiroyuki CC: Yinghai Lu Signed-off-by: Wu Fengguang LKML-Reference: <20100122033004.470767217@intel.com> Cc: Andrew Morton Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/page_types.h | 1 - arch/x86/mm/ioremap.c | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 642fe34b36a2..a667f24c7254 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -40,7 +40,6 @@ #ifndef __ASSEMBLY__ -extern int page_is_ram(unsigned long pagenr); extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 30e068d6462e..1bf9e08ed733 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -24,27 +24,6 @@ #include "physaddr.h" -int page_is_ram(unsigned long pagenr) -{ - resource_size_t addr, end; - int i; - - for (i = 0; i < e820.nr_map; i++) { - /* - * Not usable memory: - */ - if (e820.map[i].type != E820_RAM) - continue; - addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; - - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. -- cgit v1.2.2 From ab09809f2eee1dc2d8f8bea636e77d176ba6c648 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 2 Feb 2010 14:38:12 -0800 Subject: x86, doc: Fix minor spelling error in arch/x86/mm/gup.c Fix minor spelling error in comment. No code change. Signed-off-by: Andy Shevchenko LKML-Reference: <201002022238.o12McDiF018720@imap1.linux-foundation.org> Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 71da1bca13cb..738e6593799d 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -18,7 +18,7 @@ static inline pte_t gup_get_pte(pte_t *ptep) #else /* * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atoimcally, + * any locks. For this we would like to load the pointers atomically, * but that is not possible (without expensive cmpxchg8b) on PAE. What * we do have is the guarantee that a pte will only either go from not * present to present, or present to not present or both -- it will not -- cgit v1.2.2 From ea0854170c95245a258b386c7a9314399c949fe0 Mon Sep 17 00:00:00 2001 From: Shaohui Zheng Date: Tue, 2 Feb 2010 13:44:16 -0800 Subject: memory hotplug: fix a bug on /dev/mem for 64-bit kernels Newly added memory can not be accessed via /dev/mem, because we do not update the variables high_memory, max_pfn and max_low_pfn. Add a function update_end_of_memory_vars() to update these variables for 64-bit kernels. [akpm@linux-foundation.org: simplify comment] Signed-off-by: Shaohui Zheng Cc: Andi Kleen Cc: Li Haicheng Reviewed-by: Wu Fengguang Reviewed-by: KAMEZAWA Hiroyuki Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5198b9bb34ef..69ddfbd91135 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -49,6 +49,7 @@ #include #include #include +#include static unsigned long dma_reserve __initdata; @@ -615,6 +616,21 @@ void __init paging_init(void) * Memory hotplug specific functions */ #ifdef CONFIG_MEMORY_HOTPLUG +/* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + /* * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. @@ -634,6 +650,9 @@ int arch_add_memory(int nid, u64 start, u64 size) ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); -- cgit v1.2.2 From f266d7f5f89652a68e21e9882c44ee9104ad8d61 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 3 Feb 2010 21:21:32 +0200 Subject: x86_64: Print modules like i386 does Print modules list during kernel BUG. Signed-off-by: Alexey Dobriyan Cc: Arjan van de Ven Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f5..907a90e2901c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -291,6 +291,7 @@ void show_registers(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); + print_modules(); __show_regs(regs, 1); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); -- cgit v1.2.2 From 615d0ebbc782b67296e3226c293f520f93f93515 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:04 -0500 Subject: kprobes: Disable booster when CONFIG_PREEMPT=y Disable kprobe booster when CONFIG_PREEMPT=y at this time, because it can't ensure that all kernel threads preempted on kprobe's boosted slot run out from the slot even using freeze_processes(). The booster on preemptive kernel will be resumed if synchronize_tasks() or something like that is introduced. 
Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Frederic Weisbecker Cc: Jim Keniston Cc: Mathieu Desnoyers Cc: Steven Rostedt LKML-Reference: <20100202214904.4694.24330.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5b8c7505b3bc..9453815138fa 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -429,7 +429,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) +#if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); -- cgit v1.2.2 From 2cfa19780d61740f65790c5bae363b759d7c96fa Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:11 -0500 Subject: ftrace/alternatives: Introducing *_text_reserved functions Introducing *_text_reserved functions for checking the text address range is partially reserved or not. This patch provides checking routines for x86 smp alternatives and dynamic ftrace. Since both functions modify fixed pieces of kernel text, they should reserve and protect those from other dynamic text modifier, like kprobes. This will also be extended when introducing other subsystems which modify fixed pieces of kernel text. Dynamic text modifiers should avoid those. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Mathieu Desnoyers Cc: Jason Baron LKML-Reference: <20100202214911.4694.16587.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/alternative.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 69b74a7b877f..ac80b7d70014 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -65,12 +65,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, void *text, void *text_end); extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); +extern int alternatives_text_reserved(void *start, void *end); #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} +static inline int alternatives_text_reserved(void *start, void *end) +{ + return 0; +} #endif /* CONFIG_SMP */ /* alternative assembly primitive: */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de7353c0ce9c..3c13284ff86d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -390,6 +390,22 @@ void alternatives_smp_switch(int smp) mutex_unlock(&smp_alt); } +/* Return 1 if the address range is reserved for smp-alternatives */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + u8 **ptr; + + list_for_each_entry(mod, &smp_alt_modules, 
next) { + if (mod->text > end || mod->text_end < start) + continue; + for (ptr = mod->locks; ptr < mod->locks_end; ptr++) + if (start <= *ptr && end >= *ptr) + return 1; + } + + return 0; +} #endif #ifdef CONFIG_PARAVIRT -- cgit v1.2.2 From 4554dbcb85a4ed2abaa2b6fa15649b796699ec89 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:18 -0500 Subject: kprobes: Check probe address is reserved Check whether the address of new probe is already reserved by ftrace or alternatives (on x86) when registering new probe. If reserved, it returns an error and not register the probe. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Mathieu Desnoyers Cc: Jason Baron LKML-Reference: <20100202214918.4694.94179.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 9453815138fa..5de9f4a9c3fd 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -337,6 +337,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + if (!can_probe((unsigned long)p->addr)) return -EILSEQ; /* insn: must be on special executable page on x86. */ -- cgit v1.2.2 From 8c48e444191de0ff84e85d41180d7bc3e74f14ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Jan 2010 13:25:31 +0100 Subject: perf_events, x86: Implement intel core solo/duo support Implement Intel Core Solo/Duo, aka. Intel Architectural Performance Monitoring Version 1. 
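The core_pmu/intel_pmu split is driven by the architectural perfmon version reported in CPUID leaf 0xA, roughly as sketched below (raw bit masks for illustration; the kernel decodes the same leaf through union cpuid10_eax):

	unsigned int eax, ebx, ecx, edx, version;

	cpuid(0xa, &eax, &ebx, &ecx, &edx);

	version = eax & 0xff;		/* bits 7:0 - version ID */
	/* bits 15:8 give the number of general-purpose counters */

	if (version < 2)
		x86_pmu = core_pmu;	/* v1 (Core Solo/Duo): no global ctrl MSRs */
	else
		x86_pmu = intel_pmu;	/* v2+: MSR_CORE_PERF_GLOBAL_CTRL and friends */
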
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Arjan van de Ven LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 133 ++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 72 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1846ead0576b..5b91992b6b25 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -227,6 +227,17 @@ static const u64 intel_perfmon_event_map[] = }; static struct event_constraint intel_core_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ @@ -1216,7 +1227,7 @@ static void intel_pmu_disable_all(void) intel_pmu_disable_bts(); } -static void amd_pmu_disable_all(void) +static void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -1226,11 +1237,11 @@ static void amd_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + rdmsrl(x86_pmu.eventsel + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -1278,7 +1289,7 @@ static void intel_pmu_enable_all(void) } } -static void amd_pmu_enable_all(void) +static void x86_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -1292,7 +1303,7 @@ static void amd_pmu_enable_all(void) val = event->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -1546,7 +1557,7 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); @@ -1598,12 +1609,6 @@ intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) x86_pmu_disable_event(hwc, idx); } -static inline void -amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - x86_pmu_disable_event(hwc, idx); -} - static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* @@ -1723,15 +1728,14 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) return; } - x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(hwc, idx); } -static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (cpuc->enabled) - x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(hwc, idx); } /* @@ -1988,50 +1992,6 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } -static int p6_pmu_handle_irq(struct 
pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - data.raw = NULL; - - cpuc = &__get_cpu_var(cpu_hw_events); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - event = cpuc->events[idx]; - hwc = &event->hw; - - val = x86_perf_event_update(event, hwc, idx); - if (val & (1ULL << (x86_pmu.event_bits - 1))) - continue; - - /* - * event overflow - */ - handled = 1; - data.period = event->hw.last_period; - - if (!x86_perf_event_set_period(event, hwc, idx)) - continue; - - if (perf_event_overflow(event, 1, &data, regs)) - p6_pmu_disable_event(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -2098,7 +2058,7 @@ again: return 1; } -static int amd_pmu_handle_irq(struct pt_regs *regs) +static int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; @@ -2133,7 +2093,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - amd_pmu_disable_event(hwc, idx); + x86_pmu.disable(hwc, idx); } if (handled) @@ -2374,7 +2334,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { static __initconst struct x86_pmu p6_pmu = { .name = "p6", - .handle_irq = p6_pmu_handle_irq, + .handle_irq = x86_pmu_handle_irq, .disable_all = p6_pmu_disable_all, .enable_all = p6_pmu_enable_all, .enable = p6_pmu_enable_event, @@ -2401,6 +2361,29 @@ static __initconst struct x86_pmu p6_pmu = { .event_constraints = intel_p6_event_constraints }; +static __initconst struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_core_event_constraints, +}; + static __initconst struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -2427,11 +2410,11 @@ static __initconst struct x86_pmu intel_pmu = { static __initconst struct x86_pmu amd_pmu = { .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_event, - .disable = amd_pmu_disable_event, + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, @@ -2498,9 +2481,10 @@ static __init int intel_pmu_init(void) version = eax.split.version_id; if (version < 2) - return -ENODEV; + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; - x86_pmu = intel_pmu; x86_pmu.version = version; x86_pmu.num_events = eax.split.num_events; x86_pmu.event_bits = 
eax.split.bit_width; @@ -2510,12 +2494,17 @@ static __init int intel_pmu_init(void) * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: */ - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + if (version > 1) + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); /* * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ @@ -2523,7 +2512,7 @@ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - x86_pmu.event_constraints = intel_core_event_constraints; + x86_pmu.event_constraints = intel_core2_event_constraints; pr_cont("Core2 events, "); break; -- cgit v1.2.2 From fce877e3a429940a986e085a41e8b57f2d922e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Jan 2010 13:25:12 +0100 Subject: bitops: Ensure the compile time HWEIGHT is only used for such Avoid accidental misuse by making non-constant uses fail to compile. Suggested-by: Andrew Morton Signed-off-by: Peter Zijlstra Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5b91992b6b25..96cfc1a4fe9f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,13 +93,16 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ +#define __EVENT_CONSTRAINT(c, n, m, w) {\ { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ - .weight = HWEIGHT64((u64)(n)), \ + .weight = (w), \ } +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) + #define INTEL_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) @@ -2622,7 +2625,8 @@ void __init init_hw_perf_events(void) register_die_notifier(&perf_event_nmi_notifier); unconstrained = (struct event_constraint) - EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0); + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, + 0, x86_pmu.num_events); pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); -- cgit v1.2.2 From 447a194b393f32699607fd99617a40abd6a95114 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 1 Feb 2010 14:50:01 +0200 Subject: perf_events, x86: Fix bug in hw_perf_enable() We cannot assume that because hwc->idx == assign[i], we can avoid reprogramming the counter in hw_perf_enable(). The event may have been scheduled out and another event may have been programmed into this counter. Thus, we need a more robust way of verifying if the counter still contains config/data related to an event. This patch adds a generation number to each counter on each cpu. Using this mechanism we can reliably verify whether the content of a counter corresponds to an event.
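The scheme is easiest to see in isolation: each counter slot carries a per-cpu tag that is bumped on every assignment, and an event may skip reprogramming only if its recorded index and tag (and, in the kernel, the cpu it last ran on) still match. A single-cpu toy model of the idea (names are illustrative, not the kernel API):

#include <stdbool.h>
#include <stdint.h>

struct slot  { uint64_t tag; };
struct event { int idx; uint64_t last_tag; };

static struct slot slots[4];		/* one generation tag per counter */

static void assign(struct event *e, int idx)
{
	e->idx = idx;
	e->last_tag = ++slots[idx].tag;	/* start a new generation */
}

/* Reprogramming may be skipped only if no other event used the slot. */
static bool still_owns(const struct event *e)
{
	return e->idx >= 0 && e->last_tag == slots[e->idx].tag;
}

int main(void)
{
	struct event a = { .idx = -1 }, b = { .idx = -1 };

	assign(&a, 0);
	assign(&b, 0);			/* b is scheduled onto counter 0 */
	return still_owns(&a) ? 1 : 0;	/* a must detect the steal: exit 0 */
}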
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4b66dc67.0b38560a.1635.ffffae18@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 96cfc1a4fe9f..a920f173a220 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -90,6 +90,7 @@ struct cpu_hw_events { int n_events; int n_added; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; @@ -1142,6 +1143,8 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->config = ARCH_PERFMON_EVENTSEL_INT; hwc->idx = -1; + hwc->last_cpu = -1; + hwc->last_tag = ~0ULL; /* * Count user and OS events unless requested not to. @@ -1457,11 +1460,14 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, return n; } - static inline void x86_assign_hw_event(struct perf_event *event, - struct hw_perf_event *hwc, int idx) + struct cpu_hw_events *cpuc, int i) { - hwc->idx = idx; + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; @@ -1480,6 +1486,15 @@ static inline void x86_assign_hw_event(struct perf_event *event, } } +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) +{ + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; +} + static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc); void hw_perf_enable(void) @@ -1508,7 +1523,14 @@ void hw_perf_enable(void) event = cpuc->event_list[i]; hwc = &event->hw; - if (hwc->idx == -1 || hwc->idx == cpuc->assign[i]) + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) continue; __x86_pmu_disable(event, cpuc); @@ -1522,12 +1544,12 @@ void hw_perf_enable(void) hwc = &event->hw; if (hwc->idx == -1) { - x86_assign_hw_event(event, hwc, cpuc->assign[i]); + x86_assign_hw_event(event, cpuc, i); x86_perf_event_set_period(event, hwc, hwc->idx); } /* * need to mark as active because x86_pmu_disable() - * clear active_mask and eventsp[] yet it preserves + * clear active_mask and events[] yet it preserves * idx */ set_bit(hwc->idx, cpuc->active_mask); -- cgit v1.2.2 From 34d2819f20782feb60f9434470ecfb200875fd41 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Feb 2010 09:51:28 +0100 Subject: x86, mtrr: Remove unused mtrr/state.c The last reference to the helpers in went away with 9a6b344ea967efa0bb5ca4cb5405f840652b66c4 leaving unused code. Remove it. 
Signed-off-by: Borislav Petkov LKML-Reference: <20100204085128.GA513@liondog.tnic> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/Makefile | 2 +- arch/x86/kernel/cpu/mtrr/state.c | 94 --------------------------------------- 2 files changed, 1 insertion(+), 95 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mtrr/state.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index f4361b56f8e9..ad9e5ed81181 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o state.o cleanup.o +obj-y := main.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c deleted file mode 100644 index dfc80b4e6b0d..000000000000 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ /dev/null @@ -1,94 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include "mtrr.h" - -/* Put the processor into a state where MTRRs can be safely set */ -void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) -{ - unsigned int cr0; - - /* Disable interrupts locally */ - local_irq_save(ctxt->flags); - - if (use_intel() || is_cpu(CYRIX)) { - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { - ctxt->cr4val = read_cr4(); - write_cr4(ctxt->cr4val & ~X86_CR4_PGE); - } - - /* - * Disable and flush caches. Note that wbinvd flushes the TLBs - * as a side-effect - */ - cr0 = read_cr0() | X86_CR0_CD; - wbinvd(); - write_cr0(cr0); - wbinvd(); - - if (use_intel()) { - /* Save MTRR state */ - rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else were excluded at the top - */ - ctxt->ccr3 = getCx86(CX86_CCR3); - } - } -} - -void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) -{ - if (use_intel()) { - /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, - ctxt->deftype_hi); - } else { - if (is_cpu(CYRIX)) { - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); - } - } -} - -/* Restore the processor after a set_mtrr_prepare */ -void set_mtrr_done(struct set_mtrr_context *ctxt) -{ - if (use_intel() || is_cpu(CYRIX)) { - - /* Flush caches and TLBs */ - wbinvd(); - - /* Restore MTRRdefType */ - if (use_intel()) { - /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, - ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else was excluded at the top - */ - setCx86(CX86_CCR3, ctxt->ccr3); - } - - /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); - - /* Restore value of CR4 */ - if (cpu_has_pge) - write_cr4(ctxt->cr4val); - } - /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); -} -- cgit v1.2.2 From 5d93a14241bf5ba299422440bc366ec43970c002 Mon Sep 17 00:00:00 2001 From: Shaun Patterson Date: Sat, 5 Dec 2009 22:30:52 -0500 Subject: vmiclock: fix comment spelling mistake Signed-off-by: Shaun Patterson Signed-off-by: Jiri Kosina --- arch/x86/kernel/vmiclock_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 74c92bb194df..25bbb9bfc312 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -171,7 +171,7 @@ static int vmi_timer_next_event(unsigned long 
delta, { /* Unfortunately, set_next_event interface only passes relative * expiry, but we want absolute expiry. It'd be better if were - * were passed an aboslute expiry, since a bunch of time may + * were passed an absolute expiry, since a bunch of time may * have been stolen between the time the delta is computed and * when we set the alarm below. */ cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); -- cgit v1.2.2 From e34b7005e5f55a55964c13ec9784e8e2b427a83c Mon Sep 17 00:00:00 2001 From: Jasper Spaans Date: Fri, 20 Nov 2009 14:20:05 +0100 Subject: arch/x86/kernel/apic/apic_flat_64.c: Make comment match the code Make the comment match the code, this also holds for intel systems, according to probe_64.c in the same directory. Signed-off-by: Jasper Spaans Signed-off-by: Jiri Kosina --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index e3c3d820c325..09d3b17ce0c2 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -223,7 +223,7 @@ struct apic apic_flat = { }; /* - * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * Physflat mode is used when there are more than 8 CPUs on a system. * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ -- cgit v1.2.2 From c9404c9c392d557a4687c4cbda022b03cb787ce9 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 18 Dec 2009 15:40:42 -0500 Subject: Fix misspelling of "should" and "shouldn't" in comments. Some comments misspell "should" or "shouldn't"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..118428085ea2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -604,7 +604,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct perf_event_attr attr; /* - * We shoud have at least an inactive breakpoint at this + * We should have at least an inactive breakpoint at this * slot. It means the user is writing dr7 without having * written the address register first */ -- cgit v1.2.2 From 71709247aa852b5c4a01e70a9186590800d15575 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Mon, 28 Dec 2009 11:50:29 -0500 Subject: xen: Fix misspelled CONFIG variable in comment. Signed-off-by: Robert P. J. Day Signed-off-by: Jiri Kosina --- arch/x86/xen/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 563d20504988..deafb65ef44e 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -361,7 +361,7 @@ static void xen_cpu_die(unsigned int cpu) alternatives_smp_switch(0); } -static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */ +static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); -- cgit v1.2.2 From fb637f3cd31783db2b654842ea32ffec15c4bd62 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Thu, 14 Jan 2010 22:16:16 -0800 Subject: fix comment typo in pci-dma.c Signed-off-by: Justin P. 
Mattock Signed-off-by: Jiri Kosina --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 75e14e21f61a..eec33a7d96a0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -38,7 +38,7 @@ int iommu_detected __read_mostly = 0; * This variable becomes 1 if iommu=pt is passed on the kernel command line. * If this variable is 1, IOMMU implementations do no DMA translation for * devices and allow every device to access to whole physical memory. This is - * useful if a user want to use an IOMMU only for KVM device assignment to + * useful if a user wants to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. */ int iommu_pass_through __read_mostly; -- cgit v1.2.2 From 17622339af2536b32cf29699ddd4ba0fe79a61d5 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 2 Feb 2010 14:41:39 -0800 Subject: clocksource: add argument to resume callback Pass the clocksource as an argument to the clocksource resume callback. Needed so we can point out which CMT channel the sh_cmt.c driver shall resume. Signed-off-by: Magnus Damm Cc: john stultz Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/tsc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad80a1c718c6..ee4fa1bfcb33 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -266,7 +266,7 @@ static void hpet_resume_device(void) force_hpet_resume(); } -static void hpet_resume_counter(void) +static void hpet_resume_counter(struct clocksource *cs) { hpet_resume_device(); hpet_restart_counter(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba0..9eeb9be26aa4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif -static void resume_tsc(void) +static void resume_tsc(struct clocksource *cs) { clocksource_tsc.cycle_last = 0; } -- cgit v1.2.2 From 5c64c7019e571a726f4aa9c1896402c15391a8ed Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 5 Feb 2010 09:37:03 -0500 Subject: x86-32: Move XQUAD definitions to numaq.h The XQUAD stuff is part of the NUMAQ architecture, so move it there. Signed-off-by: Brian Gerst LKML-Reference: <1265380629-3212-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_32.h | 3 --- arch/x86/include/asm/numaq.h | 4 ++++ arch/x86/pci/numaq_32.c | 6 +----- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h index a299900f5920..e16b9dbef81a 100644 --- a/arch/x86/include/asm/io_32.h +++ b/arch/x86/include/asm/io_32.h @@ -37,9 +37,6 @@ * - Arnaldo Carvalho de Melo */ -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ - #ifdef __KERNEL__ #include diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 9f0a5f5d29ec..13370b95ea94 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -33,6 +33,10 @@ extern int get_memcfg_numaq(void); extern void *xquad_portio; +#define XQUAD_PORTIO_BASE 0xfe400000 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. 
*/ +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) + /* * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the */ diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8eb295e116f6..8884a1c1ada6 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -8,9 +8,7 @@ #include #include #include - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ +#include #define BUS2QUAD(global) (mp_bus_id_to_node[global]) @@ -18,8 +16,6 @@ #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) - #define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) -- cgit v1.2.2 From bd2984e96452855d148ebce76f696dcecbc96340 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 5 Feb 2010 09:37:04 -0500 Subject: x86-32: Remove _local variants of in/out from io_32.h These were leftover from the numaq support that was removed in commit 1fba38703d0ce8a5ff0fad9df3eccc6b55cf2cfb. Signed-off-by: Brian Gerst LKML-Reference: <1265380629-3212-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_32.h | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h index e16b9dbef81a..72a6a4a930ae 100644 --- a/arch/x86/include/asm/io_32.h +++ b/arch/x86/include/asm/io_32.h @@ -120,47 +120,21 @@ static inline void slow_down_io(void) #endif -#define __BUILDIO(bwl, bw, type) \ -static inline void out##bwl(unsigned type value, int port) \ -{ \ - out##bwl##_local(value, port); \ -} \ - \ -static inline unsigned type in##bwl(int port) \ -{ \ - return in##bwl##_local(port); \ -} - #define BUILDIO(bwl, bw, type) \ -static inline void out##bwl##_local(unsigned type value, int port) \ +static inline void out##bwl(unsigned type value, int port) \ { \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ -static inline unsigned type in##bwl##_local(int port) \ +static inline unsigned type in##bwl(int port) \ { \ unsigned type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ : "=a"(value) : "Nd"(port)); \ return value; \ } \ \ -static inline void out##bwl##_local_p(unsigned type value, int port) \ -{ \ - out##bwl##_local(value, port); \ - slow_down_io(); \ -} \ - \ -static inline unsigned type in##bwl##_local_p(int port) \ -{ \ - unsigned type value = in##bwl##_local(port); \ - slow_down_io(); \ - return value; \ -} \ - \ -__BUILDIO(bwl, bw, type) \ - \ static inline void out##bwl##_p(unsigned type value, int port) \ { \ out##bwl(value, port); \ -- cgit v1.2.2 From 2e16fc7728a77755b5b2dc6b27dde62cd97b9ea5 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 5 Feb 2010 09:37:05 -0500 Subject: x86-64: Reorganize io_64.h Make it more similar to io_32.h. No real code changes. Signed-off-by: Brian Gerst LKML-Reference: <1265380629-3212-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/io_64.h | 90 ++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h index 244067893af4..040bf74d717d 100644 --- a/arch/x86/include/asm/io_64.h +++ b/arch/x86/include/asm/io_64.h @@ -35,6 +35,54 @@ * - Arnaldo Carvalho de Melo */ +#ifdef __KERNEL__ + +#include + +#include + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + +void memset_io(volatile void __iomem *a, int b, size_t c); + +void __memcpy_fromio(void *, unsigned long, unsigned); +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, + unsigned len) +{ + __memcpy_fromio(to, (unsigned long)from, len); +} + +void __memcpy_toio(unsigned long, const void *, unsigned); +static inline void memcpy_toio(volatile void __iomem *to, const void *from, + unsigned len) +{ + __memcpy_toio((unsigned long)to, from, len); +} + +/* + * ISA space is 'always mapped' on a typical x86 system, no need to + * explicitly ioremap() it. The fact that the ISA IO space is mapped + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values + * are physical addresses. The following constant pointer can be + * used as the IO-area pointer (it can be iounmapped as well, so the + * analogy with PCI is quite large): + */ +#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) + +/* + * Cache management + * + * This needed for two cases + * 1. Out of order aware processors + * 2. Accidentally out of order processors (PPro errata #51) + */ +#define flush_write_buffers() do { } while (0) + +#endif /* __KERNEL__ */ + extern void native_io_delay(void); extern int io_delay_type; @@ -53,6 +101,7 @@ static inline void slow_down_io(void) native_io_delay(); #endif } + #endif /* @@ -136,46 +185,5 @@ __OUTS(b) __OUTS(w) __OUTS(l) -#if defined(__KERNEL__) && defined(__x86_64__) - -#include - -#include - -void __memcpy_fromio(void *, unsigned long, unsigned); -void __memcpy_toio(unsigned long, const void *, unsigned); - -static inline void memcpy_fromio(void *to, const volatile void __iomem *from, - unsigned len) -{ - __memcpy_fromio(to, (unsigned long)from, len); -} - -static inline void memcpy_toio(volatile void __iomem *to, const void *from, - unsigned len) -{ - __memcpy_toio((unsigned long)to, from, len); -} - -void memset_io(volatile void __iomem *a, int b, size_t c); - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -#define flush_write_buffers() - -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -#endif /* __KERNEL__ */ #endif /* _ASM_X86_IO_64_H */ -- cgit v1.2.2 From 2b4df4d4f7de1a834d252c7da3197fce634cbf0e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 5 Feb 2010 09:37:06 -0500 Subject: x86-64: Use BUILDIO in io_64.h Copied from io_32.h. Signed-off-by: Brian Gerst LKML-Reference: <1265380629-3212-5-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/io_64.h | 112 ++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h index 040bf74d717d..4a94aef5acf1 100644 --- a/arch/x86/include/asm/io_64.h +++ b/arch/x86/include/asm/io_64.h @@ -104,86 +104,48 @@ static inline void slow_down_io(void) #endif -/* - * Talk about misusing macros.. - */ -#define __OUT1(s, x) \ -static inline void out##s(unsigned x value, unsigned short port) { - -#define __OUT2(s, s1, s2) \ -asm volatile ("out" #s " %" s1 "0,%" s2 "1" - -#ifndef REALLY_SLOW_IO -#define REALLY_SLOW_IO -#define UNSET_REALLY_SLOW_IO -#endif - -#define __OUT(s, s1, x) \ - __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ - } \ - __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ +#define BUILDIO(bwl, bw, type) \ +static inline void out##bwl(unsigned type value, int port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline unsigned type in##bwl(int port) \ +{ \ + unsigned type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} \ + \ +static inline void out##bwl##_p(unsigned type value, int port) \ +{ \ + out##bwl(value, port); \ slow_down_io(); \ -} - -#define __IN1(s) \ -static inline RETURN_TYPE in##s(unsigned short port) \ +} \ + \ +static inline unsigned type in##bwl##_p(int port) \ { \ - RETURN_TYPE _v; - -#define __IN2(s, s1, s2) \ - asm volatile ("in" #s " %" s2 "1,%" s1 "0" - -#define __IN(s, s1, i...) \ - __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ - return _v; \ - } \ - __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ - slow_down_io(); \ - return _v; } - -#ifdef UNSET_REALLY_SLOW_IO -#undef REALLY_SLOW_IO -#endif - -#define __INS(s) \ -static inline void ins##s(unsigned short port, void *addr, \ - unsigned long count) \ + unsigned type value = in##bwl(port); \ + slow_down_io(); \ + return value; \ +} \ + \ +static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ - asm volatile ("rep ; ins" #s \ - : "=D" (addr), "=c" (count) \ - : "d" (port), "0" (addr), "1" (count)); \ -} - -#define __OUTS(s) \ -static inline void outs##s(unsigned short port, const void *addr, \ - unsigned long count) \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) : "d"(port)); \ +} \ + \ +static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ - asm volatile ("rep ; outs" #s \ - : "=S" (addr), "=c" (count) \ - : "d" (port), "0" (addr), "1" (count)); \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) : "d"(port)); \ } -#define RETURN_TYPE unsigned char -__IN(b, "") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned short -__IN(w, "") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned int -__IN(l, "") -#undef RETURN_TYPE - -__OUT(b, "b", char) -__OUT(w, "w", short) -__OUT(l, , int) - -__INS(b) -__INS(w) -__INS(l) - -__OUTS(b) -__OUTS(w) -__OUTS(l) - +BUILDIO(b, b, char) +BUILDIO(w, w, short) +BUILDIO(l, , int) #endif /* _ASM_X86_IO_64_H */ -- cgit v1.2.2 From 6175ddf06b6172046a329e3abfd9c901a43efd2e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 5 Feb 2010 09:37:07 -0500 Subject: x86: Clean up mem*io functions. Iomem has no special significance on x86. Use the standard mem* functions instead of trying to call other versions. Some fixups are needed to match the function prototypes. 
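As a usage illustration, callers are unaffected by the cleanup; on x86 the copies now simply go through the standard mem* routines. A hypothetical driver fragment (sync_region and MY_REGION_LEN are made up for the example):

#include <linux/io.h>
#include <linux/string.h>

#define MY_REGION_LEN 256	/* hypothetical MMIO region size */

static void sync_region(void *shadow, void __iomem *mmio)
{
	/* iomem has no special significance on x86, so after this
	 * change these degrade to plain memset/memcpy internally */
	memset_io(mmio, 0, MY_REGION_LEN);
	memcpy_fromio(shadow, mmio, MY_REGION_LEN);
	memcpy_toio(mmio, shadow, MY_REGION_LEN);
}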
Signed-off-by: Brian Gerst LKML-Reference: <1265380629-3212-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/misc.c | 13 ++++--------- arch/x86/include/asm/io_32.h | 10 +++++----- arch/x86/include/asm/io_64.h | 22 +++++++++++++--------- arch/x86/lib/Makefile | 2 +- arch/x86/lib/io_64.c | 25 ------------------------- 5 files changed, 23 insertions(+), 49 deletions(-) delete mode 100644 arch/x86/lib/io_64.c (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 3b22fe8ab91b..88042e812d3c 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -19,11 +19,6 @@ #define _ASM_X86_DESC_H 1 #endif -#ifdef CONFIG_X86_64 -#define _LINUX_STRING_H_ 1 -#define __LINUX_BITMAP_H 1 -#endif - #include #include #include @@ -131,8 +126,8 @@ static void error(char *m); static struct boot_params *real_mode; /* Pointer to real-mode data */ static int quiet; -static void *memset(void *s, int c, unsigned n); -void *memcpy(void *dest, const void *src, unsigned n); +void *memset(void *s, int c, size_t n); +void *memcpy(void *dest, const void *src, size_t n); static void __putstr(int, const char *); #define putstr(__x) __putstr(0, __x) @@ -223,7 +218,7 @@ static void __putstr(int error, const char *s) outb(0xff & (pos >> 1), vidport+1); } -static void *memset(void *s, int c, unsigned n) +void *memset(void *s, int c, size_t n) { int i; char *ss = s; @@ -233,7 +228,7 @@ static void *memset(void *s, int c, unsigned n) return s; } -void *memcpy(void *dest, const void *src, unsigned n) +void *memcpy(void *dest, const void *src, size_t n) { int i; const char *s = src; diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h index 72a6a4a930ae..685e33293468 100644 --- a/arch/x86/include/asm/io_32.h +++ b/arch/x86/include/asm/io_32.h @@ -49,21 +49,21 @@ #define xlate_dev_kmem_ptr(p) p static inline void -memset_io(volatile void __iomem *addr, unsigned char val, int count) +memset_io(volatile void __iomem *addr, unsigned char val, size_t count) { memset((void __force *)addr, val, count); } static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, int count) +memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) { - __memcpy(dst, (const void __force *)src, count); + memcpy(dst, (const void __force *)src, count); } static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, int count) +memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) { - __memcpy((void __force *)dst, src, count); + memcpy((void __force *)dst, src, count); } /* diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h index 4a94aef5acf1..1305525813fc 100644 --- a/arch/x86/include/asm/io_64.h +++ b/arch/x86/include/asm/io_64.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_IO_64_H #define _ASM_X86_IO_64_H +#include +#include /* * This file contains the definitions for the x86 IO instructions @@ -46,20 +48,22 @@ */ #define xlate_dev_kmem_ptr(p) p -void memset_io(volatile void __iomem *a, int b, size_t c); +static inline void +memset_io(volatile void __iomem *addr, unsigned char val, size_t count) +{ + memset((void __force *)addr, val, count); +} -void __memcpy_fromio(void *, unsigned long, unsigned); -static inline void memcpy_fromio(void *to, const volatile void __iomem *from, - unsigned len) +static inline void +memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) { - __memcpy_fromio(to, (unsigned long)from, len); + 
memcpy(dst, (const void __force *)src, count); } -void __memcpy_toio(unsigned long, const void *, unsigned); -static inline void memcpy_toio(volatile void __iomem *to, const void *from, - unsigned len) +static inline void +memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) { - __memcpy_toio((unsigned long)to, from, len); + memcpy((void __force *)dst, src, count); } /* diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index cffd754f3039..fff14272dbad 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -34,7 +34,7 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) endif lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else - obj-y += io_64.o iomap_copy_64.o + obj-y += iomap_copy_64.o lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c deleted file mode 100644 index 3f1eb59b5f08..000000000000 --- a/arch/x86/lib/io_64.c +++ /dev/null @@ -1,25 +0,0 @@ -#include -#include -#include - -void __memcpy_toio(unsigned long dst, const void *src, unsigned len) -{ - __inline_memcpy((void *)dst, src, len); -} -EXPORT_SYMBOL(__memcpy_toio); - -void __memcpy_fromio(void *dst, unsigned long src, unsigned len) -{ - __inline_memcpy(dst, (const void *)src, len); -} -EXPORT_SYMBOL(__memcpy_fromio); - -void memset_io(volatile void __iomem *a, int b, size_t c) -{ - /* - * TODO: memset can mangle the IO patterns quite a bit. - * perhaps it would be better to use a dumb one: - */ - memset((void *)a, b, c); -} -EXPORT_SYMBOL(memset_io); -- cgit v1.2.2 From 910bf6ad0be3e1efbda0e9d358794937b52c9860 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 5 Feb 2010 09:37:08 -0500 Subject: x86: Simplify flush_write_buffers() Always make it an inline instead of using a macro for the no-op case. Signed-off-by: Brian Gerst LKML-Reference: <1265380629-3212-7-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_32.h | 10 ++-------- arch/x86/include/asm/io_64.h | 8 +++++++- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h index 685e33293468..e8177f3b87f6 100644 --- a/arch/x86/include/asm/io_32.h +++ b/arch/x86/include/asm/io_32.h @@ -84,18 +84,12 @@ memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) * 2. Accidentally out of order processors (PPro errata #51) */ -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) - static inline void flush_write_buffers(void) { +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) asm volatile("lock; addl $0,0(%%esp)": : :"memory"); -} - -#else - -#define flush_write_buffers() do { } while (0) - #endif +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h index 1305525813fc..6964a1c366d3 100644 --- a/arch/x86/include/asm/io_64.h +++ b/arch/x86/include/asm/io_64.h @@ -83,7 +83,13 @@ memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) * 1. Out of order aware processors * 2. 
Accidentally out of order processors (PPro errata #51) */ -#define flush_write_buffers() do { } while (0) + +static inline void flush_write_buffers(void) +{ +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + asm volatile("lock; addl $0,0(%%esp)": : :"memory"); +#endif +} #endif /* __KERNEL__ */ -- cgit v1.2.2 From 1c5b9069e12e20d2fe883076ae0bf73966492108 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 5 Feb 2010 09:37:09 -0500 Subject: x86: Merge io.h io_32.h and io_64.h are now identical. Merge them into io.h. Signed-off-by: Brian Gerst LKML-Reference: <1265380629-3212-8-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io.h | 155 ++++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/io_32.h | 161 ------------------------------------------- arch/x86/include/asm/io_64.h | 161 ------------------------------------------- 3 files changed, 152 insertions(+), 325 deletions(-) delete mode 100644 arch/x86/include/asm/io_32.h delete mode 100644 arch/x86/include/asm/io_64.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 73739322b6d0..a1dcfa3ab17d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -1,8 +1,42 @@ #ifndef _ASM_X86_IO_H #define _ASM_X86_IO_H +/* + * This file contains the definitions for the x86 IO instructions + * inb/inw/inl/outb/outw/outl and the "string versions" of the same + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" + * versions of the single-IO instructions (inb_p/inw_p/..). + * + * This file is not meant to be obfuscating: it's just complicated + * to (a) handle it all in a way that makes gcc able to optimize it + * as well as possible and (b) trying to avoid writing the same thing + * over and over again with slight variations and possibly making a + * mistake somewhere. + */ + +/* + * Thanks to James van Artsdalen for a better timing-fix than + * the two short jumps: using outb's to a nonexistent port seems + * to guarantee better timings even on fast machines. + * + * On the other hand, I'd like to be sure of a non-existent port: + * I feel a bit unsafe about using 0x80 (should be safe, though) + * + * Linus + */ + + /* + * Bit simplified and optimized by Jan Hubicka + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. + * + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, + * isa_read[wl] and isa_write[wl] fixed + * - Arnaldo Carvalho de Melo + */ + #define ARCH_HAS_IOREMAP_WC +#include #include #include #include @@ -173,11 +207,126 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); -#ifdef CONFIG_X86_32 -# include "io_32.h" +#ifdef __KERNEL__ + +#include + +#include + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + +static inline void +memset_io(volatile void __iomem *addr, unsigned char val, size_t count) +{ + memset((void __force *)addr, val, count); +} + +static inline void +memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) +{ + memcpy(dst, (const void __force *)src, count); +} + +static inline void +memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) +{ + memcpy((void __force *)dst, src, count); +} + +/* + * ISA space is 'always mapped' on a typical x86 system, no need to + * explicitly ioremap() it. 
The fact that the ISA IO space is mapped + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values + * are physical addresses. The following constant pointer can be + * used as the IO-area pointer (it can be iounmapped as well, so the + * analogy with PCI is quite large): + */ +#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) + +/* + * Cache management + * + * This needed for two cases + * 1. Out of order aware processors + * 2. Accidentally out of order processors (PPro errata #51) + */ + +static inline void flush_write_buffers(void) +{ +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + asm volatile("lock; addl $0,0(%%esp)": : :"memory"); +#endif +} + +#endif /* __KERNEL__ */ + +extern void native_io_delay(void); + +extern int io_delay_type; +extern void io_delay_init(void); + +#if defined(CONFIG_PARAVIRT) +#include #else -# include "io_64.h" + +static inline void slow_down_io(void) +{ + native_io_delay(); +#ifdef REALLY_SLOW_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); #endif +} + +#endif + +#define BUILDIO(bwl, bw, type) \ +static inline void out##bwl(unsigned type value, int port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline unsigned type in##bwl(int port) \ +{ \ + unsigned type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} \ + \ +static inline void out##bwl##_p(unsigned type value, int port) \ +{ \ + out##bwl(value, port); \ + slow_down_io(); \ +} \ + \ +static inline unsigned type in##bwl##_p(int port) \ +{ \ + unsigned type value = in##bwl(port); \ + slow_down_io(); \ + return value; \ +} \ + \ +static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +{ \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) : "d"(port)); \ +} \ + \ +static inline void ins##bwl(int port, void *addr, unsigned long count) \ +{ \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) : "d"(port)); \ +} + +BUILDIO(b, b, char) +BUILDIO(w, w, short) +BUILDIO(l, , int) extern void *xlate_dev_mem_ptr(unsigned long phys); extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h deleted file mode 100644 index e8177f3b87f6..000000000000 --- a/arch/x86/include/asm/io_32.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef _ASM_X86_IO_32_H -#define _ASM_X86_IO_32_H - -#include -#include - -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). - * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. 
- * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo - */ - -#ifdef __KERNEL__ - -#include - -#include - -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -/* - * Cache management - * - * This needed for two cases - * 1. Out of order aware processors - * 2. Accidentally out of order processors (PPro errata #51) - */ - -static inline void flush_write_buffers(void) -{ -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) - asm volatile("lock; addl $0,0(%%esp)": : :"memory"); -#endif -} - -#endif /* __KERNEL__ */ - -extern void native_io_delay(void); - -extern int io_delay_type; -extern void io_delay_init(void); - -#if defined(CONFIG_PARAVIRT) -#include -#else - -static inline void slow_down_io(void) -{ - native_io_delay(); -#ifdef REALLY_SLOW_IO - native_io_delay(); - native_io_delay(); - native_io_delay(); -#endif -} - -#endif - -#define BUILDIO(bwl, bw, type) \ -static inline void out##bwl(unsigned type value, int port) \ -{ \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ - : : "a"(value), "Nd"(port)); \ -} \ - \ -static inline unsigned type in##bwl(int port) \ -{ \ - unsigned type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ - : "=a"(value) : "Nd"(port)); \ - return value; \ -} \ - \ -static inline void out##bwl##_p(unsigned type value, int port) \ -{ \ - out##bwl(value, port); \ - slow_down_io(); \ -} \ - \ -static inline unsigned type in##bwl##_p(int port) \ -{ \ - unsigned type value = in##bwl(port); \ - slow_down_io(); \ - return value; \ -} \ - \ -static inline void outs##bwl(int port, const void *addr, unsigned long count) \ -{ \ - asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ -} \ - \ -static inline void ins##bwl(int port, void *addr, unsigned long count) \ -{ \ - asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ -} - -BUILDIO(b, b, char) -BUILDIO(w, w, short) -BUILDIO(l, , int) - -#endif /* _ASM_X86_IO_32_H */ diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h deleted file mode 100644 index 6964a1c366d3..000000000000 --- a/arch/x86/include/asm/io_64.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef _ASM_X86_IO_64_H -#define _ASM_X86_IO_64_H - -#include -#include - -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). 
- * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. - * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo - */ - -#ifdef __KERNEL__ - -#include - -#include - -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -/* - * Cache management - * - * This needed for two cases - * 1. Out of order aware processors - * 2. 
Accidentally out of order processors (PPro errata #51) - */ - -static inline void flush_write_buffers(void) -{ -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) - asm volatile("lock; addl $0,0(%%esp)": : :"memory"); -#endif -} - -#endif /* __KERNEL__ */ - -extern void native_io_delay(void); - -extern int io_delay_type; -extern void io_delay_init(void); - -#if defined(CONFIG_PARAVIRT) -#include -#else - -static inline void slow_down_io(void) -{ - native_io_delay(); -#ifdef REALLY_SLOW_IO - native_io_delay(); - native_io_delay(); - native_io_delay(); -#endif -} - -#endif - -#define BUILDIO(bwl, bw, type) \ -static inline void out##bwl(unsigned type value, int port) \ -{ \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ - : : "a"(value), "Nd"(port)); \ -} \ - \ -static inline unsigned type in##bwl(int port) \ -{ \ - unsigned type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ - : "=a"(value) : "Nd"(port)); \ - return value; \ -} \ - \ -static inline void out##bwl##_p(unsigned type value, int port) \ -{ \ - out##bwl(value, port); \ - slow_down_io(); \ -} \ - \ -static inline unsigned type in##bwl##_p(int port) \ -{ \ - unsigned type value = in##bwl(port); \ - slow_down_io(); \ - return value; \ -} \ - \ -static inline void outs##bwl(int port, const void *addr, unsigned long count) \ -{ \ - asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ -} \ - \ -static inline void ins##bwl(int port, void *addr, unsigned long count) \ -{ \ - asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ -} - -BUILDIO(b, b, char) -BUILDIO(w, w, short) -BUILDIO(l, , int) - -#endif /* _ASM_X86_IO_64_H */ -- cgit v1.2.2 From 841582ea9e29a8f757c30c5377ce649586ba793a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 2 Feb 2010 14:38:14 -0800 Subject: x86, uv: Update UV arch to target Legacy VGA I/O correctly. Add function to direct Legacy VGA I/O traffic to correct I/O Hub. Signed-off-by: Mike Travis LKML-Reference: <201002022238.o12McEbi018727@imap1.linux-foundation.org> Cc: Thomas Gleixner Cc: Robin Holt Cc: Jack Steiner Cc: Ingo Molnar Cc: Jesse Barnes Cc: David Airlie Signed-off-by: Andrew Morton Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/uv/bios.h | 4 +++- arch/x86/kernel/apic/x2apic_uv_x.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/bios_uv.c | 19 +++++++++++++++++++ 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 2751f3075d8b..163427597d03 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -36,7 +36,8 @@ enum uv_bios_cmd { UV_BIOS_WATCHLIST_ALLOC, UV_BIOS_WATCHLIST_FREE, UV_BIOS_MEMPROTECT, - UV_BIOS_GET_PARTITION_ADDR + UV_BIOS_GET_PARTITION_ADDR, + UV_BIOS_SET_LEGACY_VGA_TARGET }; /* @@ -96,6 +97,7 @@ extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); +extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); extern void uv_bios_init(void); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 21db3cbea7dc..6ef2899eb861 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -34,6 +35,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); +#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) + static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; int uv_min_hub_revision_id; @@ -553,6 +556,30 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ +/* Direct Legacy VGA I/O traffic to designated IOH */ +int uv_set_vga_state(struct pci_dev *pdev, bool decode, + unsigned int command_bits, bool change_bridge) +{ + int domain, bus, rc; + + PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", + pdev->devfn, decode, command_bits, change_bridge); + + if (!change_bridge) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); + + return rc; +} + /* * Called on each cpu to initialize the per_cpu UV data area. 
* FIXME: hotplug not supported yet @@ -691,4 +718,7 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); proc_mkdir("sgi_uv", NULL); + + /* register Legacy VGA I/O redirection handler */ + pci_register_set_vga_state(uv_set_vga_state); } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index b0206a211b09..575127a6e352 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -154,6 +154,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) } EXPORT_SYMBOL_GPL(uv_bios_freq_base); +/* + * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target + * @decode: true to enable target, false to disable target + * @domain: PCI domain number + * @bus: PCI bus number + * + * Returns: + * 0: Success + * -EINVAL: Invalid domain or bus number + * -ENOSYS: Capability not available + * -EBUSY: Legacy VGA I/O cannot be retargeted at this time + */ +int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) +{ + return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, + (u64)decode, (u64)domain, (u64)bus, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); + #ifdef CONFIG_EFI void uv_bios_init(void) -- cgit v1.2.2 From 3235dc3f22378f35ce77eba0d0f62db2d9c4844e Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Sat, 6 Feb 2010 18:47:17 +0100 Subject: x86: Remove trailing spaces in messages Signed-off-by: Frans Pop Cc: Avi Kivity Cc: x86@kernel.org LKML-Reference: <1265478443-31072-10-git-send-email-elendil@planet.nl> [ Left out the KVM bits. ] Signed-off-by: Ingo Molnar --- arch/x86/boot/mkcpustr.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/apm_32.c | 4 ++-- arch/x86/kernel/efi.c | 2 +- arch/x86/kernel/microcode_intel.c | 2 +- arch/x86/kernel/uv_sysfs.c | 6 +++--- arch/x86/tools/test_get_len.c | 4 ++-- 9 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 8ef60f20b371..919257f526f2 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -22,7 +22,7 @@ int main(void) int i, j; const char *str; - printf("static const char x86_cap_strs[] = \n"); + printf("static const char x86_cap_strs[] =\n"); for (i = 0; i < NCAPINTS; i++) { for (j = 0; j < 32; j++) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e80f291472a4..71c4443bb91f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -587,7 +587,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld) \n", + "PM-Timer: %lu (%ld)\n", (unsigned long)res, *deltatsc); *deltatsc = (long)res; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816d..6bdd2c7ead75 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1647,7 +1647,7 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); + " Stat Dmod Deli Vect:\n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 98c4665f251c..47dd856708e5 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) mpc_record = 0; printk(KERN_INFO - "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + "Found an OEM MPC table at %8p - parsing it...\n", oemtable); if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { printk(KERN_WARNING diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index b5b6b23bce53..031aa887b0eb 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d) apm_info.disabled = 1; printk(KERN_INFO "%s machine detected. " "Disabling APM.\n", d->ident); - printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); - printk(KERN_INFO "download from support.intel.com \n"); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); } return 0; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index cdcfb122f256..c2fa9b8b497e 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -362,7 +362,7 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); early_iounmap(tmp, 2); - printk(KERN_INFO "EFI v%u.%.02u by %s \n", + printk(KERN_INFO "EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index ebd193e476ca..85a343e28937 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -328,7 +328,7 @@ static int apply_microcode(int cpu) cpu_num, mc_intel->hdr.rev); return -1; } - pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c index 36afb98675a4..309c70fb7759 100644 --- a/arch/x86/kernel/uv_sysfs.c +++ b/arch/x86/kernel/uv_sysfs.c @@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void) if (!sgi_uv_kobj) sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); return -EINVAL; } ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); return ret; } ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); return ret; } diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index bee8d6ac2691..13403fc95a96 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -43,7 +43,7 @@ static int x86_64; static void usage(void) { fprintf(stderr, "Usage: objdump -d a.out | 
awk -f distill.awk |" - " %s [-y|-n] [-v] \n", prog); + " %s [-y|-n] [-v]\n", prog); fprintf(stderr, "\t-y 64bit mode\n"); fprintf(stderr, "\t-n 32bit mode\n"); fprintf(stderr, "\t-v verbose mode\n"); @@ -69,7 +69,7 @@ static void dump_field(FILE *fp, const char *name, const char *indent, static void dump_insn(FILE *fp, struct insn *insn) { - fprintf(fp, "Instruction = { \n"); + fprintf(fp, "Instruction = {\n"); dump_field(fp, "prefixes", "\t", &insn->prefixes); dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); -- cgit v1.2.2 From 076dc4a65a6d99a16979e2c7917e669fb8c91ee5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 5 Feb 2010 12:16:47 -0500 Subject: x86/alternatives: Fix build warning Fixes these warnings: arch/x86/kernel/alternative.c: In function 'alternatives_text_reserved': arch/x86/kernel/alternative.c:402: warning: comparison of distinct pointer types lacks a cast arch/x86/kernel/alternative.c:402: warning: comparison of distinct pointer types lacks a cast arch/x86/kernel/alternative.c:405: warning: comparison of distinct pointer types lacks a cast arch/x86/kernel/alternative.c:405: warning: comparison of distinct pointer types lacks a cast Caused by: 2cfa197: ftrace/alternatives: Introducing *_text_reserved functions Changes in v2: - Use local variables to compare, instead of type casts. Reported-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE LKML-Reference: <20100205171647.15750.37221.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3c13284ff86d..e63b80e5861c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -395,12 +395,14 @@ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; u8 **ptr; + u8 *text_start = start; + u8 *text_end = end; list_for_each_entry(mod, &smp_alt_modules, next) { - if (mod->text > end || mod->text_end < start) + if (mod->text > text_end || mod->text_end < text_start) continue; for (ptr = mod->locks; ptr < mod->locks_end; ptr++) - if (start <= *ptr && end >= *ptr) + if (text_start <= *ptr && text_end >= *ptr) return 1; } -- cgit v1.2.2 From 3ad2f3fbb961429d2aa627465ae4829758bc7e07 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 3 Feb 2010 08:01:28 +0800 Subject: tree-wide: Assorted spelling fixes In particular, several occurrences of funny versions of 'success', 'unknown', 'therefore', 'acknowledge', 'argument', 'achieve', 'address', 'beginning', 'desirable', 'separate' and 'necessary' are fixed.
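For context on the "comparison of distinct pointer types" warning fixed in the alternatives patch above: C defines the relational operators (<, <=, >, >=) only for pointers to compatible object types, so comparing the 'void *' parameters directly against 'u8 *' values is what draws the gcc warning, and converting once through typed locals silences it. A minimal user-space sketch of the same pattern; the u8 typedef and the in_range() helper are invented for illustration and are not kernel code:

#include <stdio.h>

typedef unsigned char u8;

/*
 * Writing "start <= p && p <= end" here would relationally compare
 * 'void *' with 'u8 *' and trigger "comparison of distinct pointer
 * types lacks a cast". Assigning the parameters to typed locals
 * first -- as the patch does with text_start/text_end -- keeps both
 * sides of every comparison 'u8 *'.
 */
static int in_range(void *start, void *end, u8 *p)
{
	u8 *text_start = start;	/* implicit void * -> u8 * conversion */
	u8 *text_end = end;

	return text_start <= p && p <= text_end;
}

int main(void)
{
	u8 buf[16];

	printf("%d\n", in_range(buf, buf + sizeof(buf), buf + 4));	/* 1 */
	printf("%d\n", in_range(buf, buf + 8, buf + 12));		/* 0 */
	return 0;
}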
Signed-off-by: Daniel Mack Cc: Joe Perches Cc: Junio C Hamano Signed-off-by: Jiri Kosina --- arch/x86/crypto/twofish-i586-asm_32.S | 10 +++++----- arch/x86/crypto/twofish-x86_64-asm_64.S | 20 ++++++++++---------- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/tsc.c | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 39b98ed2c1b9..575331cb2a8a 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -22,7 +22,7 @@ #include -/* return adress at 0 */ +/* return address at 0 */ #define in_blk 12 /* input byte array address parameter*/ #define out_blk 8 /* output byte array address parameter*/ @@ -230,8 +230,8 @@ twofish_enc_blk: push %edi mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ - add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ - mov in_blk+16(%esp),%edi /* input adress in edi */ + add $crypto_tfm_ctx_offset, %ebp /* ctx address */ + mov in_blk+16(%esp),%edi /* input address in edi */ mov (%edi), %eax mov b_offset(%edi), %ebx @@ -286,8 +286,8 @@ twofish_dec_blk: mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ - add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ - mov in_blk+16(%esp),%edi /* input adress in edi */ + add $crypto_tfm_ctx_offset, %ebp /* ctx address */ + mov in_blk+16(%esp),%edi /* input address in edi */ mov (%edi), %eax mov b_offset(%edi), %ebx diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 35974a586615..573aa102542e 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -221,11 +221,11 @@ twofish_enc_blk: pushq R1 - /* %rdi contains the crypto tfm adress */ - /* %rsi contains the output adress */ - /* %rdx contains the input adress */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ - /* ctx adress is moved to free one non-rex register + /* %rdi contains the crypto tfm address */ + /* %rsi contains the output address */ + /* %rdx contains the input address */ + add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ + /* ctx address is moved to free one non-rex register as target for the 8bit high operations */ mov %rdi, %r11 @@ -274,11 +274,11 @@ twofish_enc_blk: twofish_dec_blk: pushq R1 - /* %rdi contains the crypto tfm adress */ - /* %rsi contains the output adress */ - /* %rdx contains the input adress */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ - /* ctx adress is moved to free one non-rex register + /* %rdi contains the crypto tfm address */ + /* %rsi contains the output address */ + /* %rdx contains the input address */ + add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ + /* ctx address is moved to free one non-rex register as target for the 8bit high operations */ mov %rdi, %r11 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2d8b5035371c..3d1e6f16b7a6 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -27,7 +27,7 @@ #define GET_CR2_INTO_RCX movq %cr2, %rcx #endif -/* we are not able to switch in one step to the final KERNEL ADRESS SPACE +/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. 
* */ diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 2bbde6078143..fb99f7edb341 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) /* * get_tce_space_from_tar(): * Function for kdump case. Get the tce tables from first kernel - * by reading the contents of the base adress register of calgary iommu + * by reading the contents of the base address register of calgary iommu */ static void __init get_tce_space_from_tar(void) { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba0..dec8f68e3eda 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -50,7 +50,7 @@ u64 native_sched_clock(void) * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform - * can achive it. ) + * can achieve it. ) */ if (unlikely(tsc_disabled)) { /* No locking but a rare wrong value is not a big deal: */ -- cgit v1.2.2 From 923de3cf5bf12049628019010e36623fca5ef6d1 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 27 Jan 2010 19:13:49 +0800 Subject: kvmclock: count total_sleep_time when updating guest clock The current kvm wallclock does not consider the total_sleep_time, which could cause a wrong wallclock in the guest after host suspend/resume. This patch solves this issue by counting total_sleep_time to get the correct host boot time. Cc: stable@kernel.org Signed-off-by: Jason Wang Acked-by: Glauber Costa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ddcad452add..a1e1bc9d412d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -670,7 +670,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; struct pvclock_wall_clock wc; - struct timespec now, sys, boot; + struct timespec boot; if (!wall_clock) return; @@ -685,9 +685,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - now = current_kernel_time(); - ktime_get_ts(&sys); - boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + getboottime(&boot); wc.sec = boot.tv_sec; wc.nsec = boot.tv_nsec; @@ -762,6 +760,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); local_irq_restore(flags); /* With all the info we got, fill in the values */ -- cgit v1.2.2 From ee73f656a604d5aa9df86a97102e4e462dd79924 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 29 Jan 2010 17:28:41 -0200 Subject: KVM: PIT: control word is write-only PIT control word (address 0x43) is write-only, reads are undefined.
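The one-line fix below works because port 0x43 masked with KVM_PIT_CHANNEL_MASK (0x3) yields channel index 3, while the PIT has only three counters (channels 0-2); returning early both models the write-only control word and avoids indexing one slot past the channel array. A rough user-space sketch of the guard follows; pit_read() and struct pit_channel are simplified stand-ins, not the KVM structures:

#include <stdio.h>

#define PIT_CHANNEL_MASK 0x3

struct pit_channel { int count; };

/*
 * Offsets 0-2 select a counter; offset 3 is the write-only control
 * word, so a read of it is declined up front rather than touching
 * channels[3], one past the end of the array.
 */
static int pit_read(const struct pit_channel channels[3], unsigned int port, int *val)
{
	unsigned int addr = port & PIT_CHANNEL_MASK;

	if (addr == 3)
		return 0;	/* nothing defined to read back */

	*val = channels[addr].count;
	return 1;
}

int main(void)
{
	const struct pit_channel ch[3] = { { 10 }, { 20 }, { 30 } };
	int v = -1;

	printf("port 0x40 readable: %d\n", pit_read(ch, 0x40, &v));	/* 1 */
	printf("port 0x43 readable: %d\n", pit_read(ch, 0x43, &v));	/* 0 */
	return 0;
}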
Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 296aba49472a..15578f180e59 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -467,6 +467,9 @@ static int pit_ioport_read(struct kvm_io_device *this, return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; + if (addr == 3) + return 0; + s = &pit_state->channels[addr]; mutex_lock(&pit_state->lock); -- cgit v1.2.2 From cf9db6c41f739a294286847aab1e85f39aef1781 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 8 Feb 2010 20:35:02 -0600 Subject: x86-32: Make AT_VECTOR_SIZE_ARCH=2 Both x86-32 and x86-64 with 32-bit compat use ARCH_DLINFO_IA32, which defines two saved_auxv entries. But system.h only defines AT_VECTOR_SIZE_ARCH as 2 for CONFIG_IA32_EMULATION, not for CONFIG_X86_32. Fix that. Signed-off-by: Serge E. Hallyn LKML-Reference: <20100209023502.GA15408@us.ibm.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/system.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index ecb544e65382..e04740f7a0bb 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -11,9 +11,9 @@ #include /* entries in ARCH_DLINFO: */ -#ifdef CONFIG_IA32_EMULATION +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) # define AT_VECTOR_SIZE_ARCH 2 -#else +#else /* else it's non-compat x86-64 */ # define AT_VECTOR_SIZE_ARCH 1 #endif -- cgit v1.2.2 From 681ee44d40d7c93b42118320e4620d07d8704fd6 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 9 Feb 2010 18:01:44 -0800 Subject: x86, apic: Don't use logical-flat mode when CPU hotplug may exceed 8 CPUs We need to fall back from logical-flat APIC mode to physical-flat mode when we have more than 8 CPUs. However, in the presence of CPU hotplug(with bios listing not enabled but possible cpus as disabled cpus in MADT), we have to consider the number of possible CPUs rather than the number of current CPUs; otherwise we may cross the 8-CPU boundary when CPUs are added later. 32bit apic code can use more cleanups (like the removal of vendor checks in 32bit default_setup_apic_routing()) and more unifications with 64bit code. Yinghai has some patches in works already. This patch addresses the boot issue that is reported in the virtualization guest context. [ hpa: incorporated function annotation feedback from Yinghai Lu ] Signed-off-by: Suresh Siddha LKML-Reference: <1265767304.2833.19.camel@sbs-t61.sc.intel.com> Acked-by: Shaohui Zheng Reviewed-by: Yinghai Lu Cc: Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/acpi/boot.c | 5 ----- arch/x86/kernel/apic/apic.c | 17 ----------------- arch/x86/kernel/apic/probe_32.c | 29 +++++++++++++++++++++++++++-- arch/x86/kernel/apic/probe_64.c | 2 +- arch/x86/kernel/mpparse.c | 7 ------- arch/x86/kernel/smpboot.c | 2 -- 6 files changed, 28 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 036d28adf59d..0acbcdfa5ca4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1185,9 +1185,6 @@ static void __init acpi_process_madt(void) if (!error) { acpi_lapic = 1; -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif /* * Parse MADT IO-APIC entries */ @@ -1197,8 +1194,6 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - if (apic->setup_apic_routing) - apic->setup_apic_routing(); } } if (error == -EINVAL) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3987e4408f75..dfca210f6a10 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1641,9 +1641,7 @@ int __init APIC_init_uniprocessor(void) #endif enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif verify_local_APIC(); connect_bsp_APIC(); @@ -1891,21 +1889,6 @@ void __cpuinit generic_processor_info(int apicid, int version) if (apicid > max_physical_apicid) max_physical_apicid = apicid; -#ifdef CONFIG_X86_32 - if (num_processors > 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } -#endif - #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1a6559f6768c..99d2fe016084 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -52,7 +52,32 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void default_setup_apic_routing(void) +void __init default_setup_apic_routing(void) +{ + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + +#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); +} + +static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC printk(KERN_INFO @@ -103,7 +128,7 @@ struct apic apic_default = { .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = default_setup_apic_routing, + .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 450fe2064a14..83e9be4778e2 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -67,7 +67,7 @@ void __init default_setup_apic_routing(void) } #endif - if (apic == &apic_flat && num_processors > 8) + if (apic == &apic_flat && num_possible_cpus() > 8) apic = &apic_physflat; printk(KERN_INFO "Setting APIC 
routing to %s\n", apic->name); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 40b54ceb68b5..a2c1edd2d3ac 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) x86_init.mpparse.mpc_record(1); } -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); - if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..b4e870cbdc60 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1083,9 +1083,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); -- cgit v1.2.2 From 0271f91003d3703675be13b8865618359a6caa1f Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Thu, 4 Feb 2010 19:06:33 +0800 Subject: x86, acpi: Map hotadded cpu to correct node. When hotadd new cpu to system, if its affinitive node is online, should map the cpu to its own node. Otherwise, let kernel select one online node for the new cpu later. Signed-off-by: Haicheng Li LKML-Reference: <4B6AAA39.6000300@linux.intel.com> Tested-by: Thomas Renninger Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/boot.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 036d28adf59d..7db15e161aa0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -49,6 +49,7 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include +# include #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -482,6 +483,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid == -1 || !node_online(nid)) + return; +#ifdef CONFIG_X86_64 + apicid_to_node[physid] = nid; + numa_set_node(cpu, nid); +#else /* CONFIG_X86_32 */ + apicid_2_node[physid] = nid; + cpu_to_node_map[cpu] = nid; +#endif + +#endif +} + static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -540,6 +560,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) } cpu = cpumask_first(new_map); + acpi_map_cpu2node(handle, cpu, physid); *pcpu = cpu; retval = 0; -- cgit v1.2.2 From 318f6b228ba88a394ef560efc1bfe028ad5ae6b6 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 10 Feb 2010 20:55:16 +0100 Subject: x86, ia32_aout: do not kill argument mapping Do not set current->mm->mmap to NULL in 32-bit emulation on 64-bit load_aout_binary after flush_old_exec as it would destroy already set brpm mapping with arguments. Introduced by b6a2fea39318e43fee84fa7b0b90d68bed92d2ba mm: variable length argument support where the argument mapping in bprm was added. [ hpa: this is a regression from 2.6.22... time to kill a.out? ] Signed-off-by: Jiri Slaby LKML-Reference: <1265831716-7668-1-git-send-email-jslaby@suse.cz> Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ollie Wild Cc: x86@kernel.org Cc: Signed-off-by: H. 
Peter Anvin --- arch/x86/ia32/ia32_aout.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index f9f472462753..14531abdd0ce 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -327,7 +327,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->free_area_cache = TASK_UNMAPPED_BASE; current->mm->cached_hole_size = 0; - current->mm->mmap = NULL; install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; -- cgit v1.2.2 From 18dce6ba5c8c6bd0f3ab4efa4cbdd698dab5c40a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:05 -0800 Subject: x86: Fix SCI on IOAPIC != 0 Thomas Renninger reported on IBM x3330 booting a latest kernel on this machine results in: PCI: PCI BIOS revision 2.10 entry at 0xfd61c, last bus=1 PCI: Using configuration type 1 for base access bio: create slab at 0 ACPI: SCI (IRQ30) allocation failed ACPI Exception: AE_NOT_ACQUIRED, Unable to install System Control Interrupt handler (20090903/evevent-161) ACPI: Unable to start the ACPI Interpreter Later all kind of devices fail... and bisect it down to this commit: commit b9c61b70075c87a8612624736faf4a2de5b1ed30 x86/pci: update pirq_enable_irq() to setup io apic routing it turns out we need to set irq routing for the sci on ioapic1 early. -v2: make it work without sparseirq too. -v3: fix checkpatch.pl warning, and cc to stable Reported-by: Thomas Renninger Bisected-by: Thomas Renninger Tested-by: Thomas Renninger Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-2-git-send-email-yinghai@kernel.org> Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_apic.h | 1 + arch/x86/kernel/acpi/boot.c | 9 +++++++- arch/x86/kernel/apic/io_apic.c | 50 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 7c7c16cde1f8..5f61f6e0ffdd 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -160,6 +160,7 @@ extern int io_apic_get_redir_entries(int ioapic); struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); +void setup_IO_APIC_irq_extra(u32 gsi); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); extern void ioapic_insert_resources(void); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdfa5ca4..5c96b75c6ea8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -446,6 +446,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { *irq = gsi; + +#ifdef CONFIG_X86_IO_APIC + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) + setup_IO_APIC_irq_extra(gsi); +#endif + return 0; } @@ -473,7 +479,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif - acpi_gsi_to_irq(plat_gsi, &irq); + irq = plat_gsi; + return irq; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816d..5e4cce254e43 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1538,6 +1538,56 @@ static void __init setup_IO_APIC_irqs(void) " (apicid-pin) not connected\n"); } +/* + * for the gsit that is not in first ioapic + * but could not 
use acpi_register_gsi() + * like some special sci in IBM x3330 + */ +void setup_IO_APIC_irq_extra(u32 gsi) +{ + int apic_id = 0, pin, idx, irq; + int node = cpu_to_node(boot_cpu_id); + struct irq_desc *desc; + struct irq_cfg *cfg; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + apic_id = mp_find_ioapic(gsi); + if (apic_id < 0) + return; + + pin = mp_find_ioapic_pin(apic_id, gsi); + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) + return; + + irq = pin_2_irq(idx, apic_id, pin); +#ifdef CONFIG_SPARSE_IRQ + desc = irq_to_desc(irq); + if (desc) + return; +#endif + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + return; + } + + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + + if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[apic_id].apicid, pin); + return; + } + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); +} + /* * Set up the timer pin, possibly with the 8259A-master behind. */ -- cgit v1.2.2 From ced5b697a76d325e7a7ac7d382dbbb632c765093 Mon Sep 17 00:00:00 2001 From: Brandon Phiilps Date: Wed, 10 Feb 2010 01:20:06 -0800 Subject: x86: Avoid race condition in pci_enable_msix() Keep chip_data in create_irq_nr and destroy_irq. When two drivers are setting up MSI-X at the same time via pci_enable_msix() there is a race. See this dmesg excerpt: [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X [ 85.170611] alloc irq_desc for 99 on node -1 [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X [ 85.170614] alloc kstat_irqs on node -1 [ 85.170616] alloc irq_2_iommu on node -1 [ 85.170617] alloc irq_desc for 100 on node -1 [ 85.170619] alloc kstat_irqs on node -1 [ 85.170621] alloc irq_2_iommu on node -1 [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X [ 85.170626] alloc irq_desc for 101 on node -1 [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X [ 85.170630] alloc kstat_irqs on node -1 [ 85.170631] alloc irq_2_iommu on node -1 [ 85.170635] alloc irq_desc for 102 on node -1 [ 85.170636] alloc kstat_irqs on node -1 [ 85.170639] alloc irq_2_iommu on node -1 [ 85.170646] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 As you can see igb and ixgbe are both alternating on create_irq_nr() via pci_enable_msix() in their probe function. ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data = NULL via dynamic_irq_init(). igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[] via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this: cfg_new = irq_desc_ptrs[102]->chip_data; if (cfg_new->vector != 0) continue; This hits the NULL deref. Another possible race exists via pci_disable_msix() in a driver or in the number of error paths that call free_msi_irqs(): destroy_irq() dynamic_irq_cleanup() which sets desc->chip_data = NULL ...race window... desc->chip_data = cfg; Remove the save and restore code for cfg in create_irq_nr() and destroy_irq() and take the desc->lock when checking the irq_cfg. Reported-and-analyzed-by: Brandon Philips Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-3-git-send-email-yinghai@kernel.org> Signed-off-by: Brandon Phililps Cc: stable@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816d..c86591b906fa 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3228,12 +3228,9 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) { - dynamic_irq_init(irq); - /* restore it, in case dynamic_irq_init clear it */ - if (desc_new) - desc_new->chip_data = cfg_new; - } + if (irq > 0) + dynamic_irq_init_keep_chip_data(irq); + return irq; } @@ -3256,17 +3253,12 @@ void destroy_irq(unsigned int irq) { unsigned long flags; struct irq_cfg *cfg; - struct irq_desc *desc; - /* store it, in case dynamic_irq_cleanup clear it */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - dynamic_irq_cleanup(irq); - /* connect back irq_cfg */ - desc->chip_data = cfg; + dynamic_irq_cleanup_keep_chip_data(irq); free_irte(irq); spin_lock_irqsave(&vector_lock, flags); + cfg = irq_to_desc(irq)->chip_data; __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } -- cgit v1.2.2 From 27811d8cabe56e0c3622251b049086f49face4ff Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:07 -0800 Subject: x86: Move range related operation to one file We have almost the same code for mtrr cleanup and amd_bus checkup, and this code will also be used in replacing bootmem with early_res, so try to move them together and reuse it from different parts. Also rename update_range to subtract_range as that is what the function is actually doing. -v2: update comments as Christoph requested Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-4-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 180 ++++--------------------------------- arch/x86/kernel/mmconf-fam10h_64.c | 7 +- arch/x86/pci/amd_bus.c | 70 +++------------ 3 files changed, 28 insertions(+), 229 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 09b1698e0466..669da09ab9a8 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -22,10 +22,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -34,11 +34,6 @@ #include "mtrr.h" -struct res_range { - unsigned long start; - unsigned long end; -}; - struct var_mtrr_range_state { unsigned long base_pfn; unsigned long size_pfn; @@ -56,7 +51,7 @@ struct var_mtrr_state { /* Should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 -static struct res_range __initdata range[RANGE_NUM]; +static struct range __initdata range[RANGE_NUM]; static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; @@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; #define Dprintk(x...) 
do { if (debug_print) printk(KERN_DEBUG x); } while (0) - -static int __init -add_range(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - /* Out of slots: */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -static int __init -add_range_with_merge(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - int i; - - /* Try to merge it with old one: */ - for (i = 0; i < nr_range; i++) { - unsigned long final_start, final_end; - unsigned long common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end + 1) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* Need to add it: */ - return add_range(range, nr_range, start, end); -} - -static void __init -subtract_range(struct res_range *range, unsigned long start, unsigned long end) -{ - int i, j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* Find the new spare: */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static int __init cmp_range(const void *x1, const void *x2) -{ - const struct res_range *r1 = x1; - const struct res_range *r2 = x2; - long start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - -static int __init clean_sort_range(struct res_range *range, int az) -{ - int i, j, k = az - 1, nr_range = 0; - - for (i = 0; i < k; i++) { - if (range[i].end) - continue; - for (j = k; j > i; j--) { - if (range[j].end) { - k = j; - break; - } - } - if (j == i) - break; - range[i].start = range[k].start; - range[i].end = range[k].end; - range[k].start = 0; - range[k].end = 0; - k--; - } - /* count it */ - for (i = 0; i < az; i++) { - if (!range[i].end) { - nr_range = i; - break; - } - } - - /* sort them */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - - return nr_range; -} - #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, +x86_get_mtrr_mem_range(struct range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) { @@ -223,13 +77,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, continue; base = range_state[i].base_pfn; size = range_state[i].size_pfn; - nr_range = add_range_with_merge(range, nr_range, base, - base + size - 1); + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size - 1); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); 
for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } @@ -252,10 +106,10 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size - 1); } if (extra_remove_size) - subtract_range(range, extra_remove_base, + subtract_range(range, RANGE_NUM, extra_remove_base, extra_remove_base + extra_remove_size - 1); if (debug_print) { @@ -263,7 +117,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } } @@ -273,20 +127,16 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } - /* clear those is not used */ - for (i = nr_range; i < RANGE_NUM; i++) - memset(&range[i], 0, sizeof(range[i])); - return nr_range; } #ifdef CONFIG_MTRR_SANITIZER -static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +static unsigned long __init sum_ranges(struct range *range, int nr_range) { unsigned long sum = 0; int i; @@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg) early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init -x86_setup_var_mtrrs(struct res_range *range, int nr_range, +x86_setup_var_mtrrs(struct range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; @@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size, unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct res_range range_new[RANGE_NUM]; + static struct range range_new[RANGE_NUM]; unsigned long range_sums_new; static int nr_range_new; int num_reg; @@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits) * [0, 1M) should always be covered by var mtrr with WB * and fixed mtrrs should take effect before var mtrr for it: */ - nr_range = add_range_with_merge(range, nr_range, 0, + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); /* Sort the ranges: */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + sort_range(range, nr_range); range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM covered: %ldM\n", diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 712d15fdc416..71825806cd44 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -7,6 +7,8 @@ #include #include #include +#include + #include #include #include @@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, }; -struct range { - u64 start; - u64 end; -}; - static int __cpuinit cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 95ecbd495955..2356ea18697d 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -2,6 +2,8 @@ #include #include #include +#include + #include #ifdef 
CONFIG_X86_64 @@ -17,58 +19,6 @@ #ifdef CONFIG_X86_64 -#define RANGE_NUM 16 - -struct res_range { - size_t start; - size_t end; -}; - -static void __init update_range(struct res_range *range, size_t start, - size_t end) -{ - int i; - int j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* find the new spare */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - struct pci_hostbridge_probe { u32 bus; u32 slot; @@ -111,6 +61,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void) fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; } +#define RANGE_NUM 16 + /** * early_fill_mp_bus_to_node() * called before pcibios_scan_root and pci_scan_bus @@ -132,7 +84,7 @@ static int __init early_fill_mp_bus_info(void) struct resource *res; size_t start; size_t end; - struct res_range range[RANGE_NUM]; + struct range range[RANGE_NUM]; u64 val; u32 address; @@ -226,7 +178,7 @@ static int __init early_fill_mp_bus_info(void) if (end > 0xffff) end = 0xffff; update_res(info, start, end, IORESOURCE_IO, 1); - update_range(range, start, end); + subtract_range(range, RANGE_NUM, start, end); } /* add left over io port range to def node/link, [0, 0xffff] */ /* find the position */ @@ -256,14 +208,14 @@ static int __init early_fill_mp_bus_info(void) end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); if (end < (1ULL<<32)) - update_range(range, 0, end - 1); + subtract_range(range, RANGE_NUM, 0, end - 1); /* get mmconfig */ get_pci_mmcfg_amd_fam10h_range(); /* need to take out mmconf range */ if (fam10h_mmconf_end) { printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); - update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); + subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end); } /* mmio resource */ @@ -318,7 +270,7 @@ static int __init early_fill_mp_bus_info(void) /* we got a hole */ endx = fam10h_mmconf_start - 1; update_res(info, start, endx, IORESOURCE_MEM, 0); - update_range(range, start, endx); + subtract_range(range, RANGE_NUM, start, endx); printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); start = fam10h_mmconf_end + 1; changed = 1; @@ -334,7 +286,7 @@ static int __init early_fill_mp_bus_info(void) } update_res(info, start, end, IORESOURCE_MEM, 1); - update_range(range, start, end); + subtract_range(range, RANGE_NUM, start, end); printk(KERN_CONT "\n"); } @@ -349,7 +301,7 @@ static int __init early_fill_mp_bus_info(void) rdmsrl(address, val); end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); - update_range(range, 1ULL<<32, end - 1); + subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1); } /* -- cgit v1.2.2 From b74fd238a9cf39a81d94152f375b756bf795b4af Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:08 -0800 Subject: x86/pci: Use resource_size_t in update_res 
Prepare to enable 32bit intel and amd bus. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-5-git-send-email-yinghai@kernel.org> Acked-by: Jesse Barnes Signed-off-by: H. Peter Anvin --- arch/x86/pci/bus_numa.c | 16 ++++++++-------- arch/x86/pci/bus_numa.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index f939d603adfa..30f65cecd65f 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -51,8 +51,8 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) } } -void __devinit update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge) +void __devinit update_res(struct pci_root_info *info, resource_size_t start, + resource_size_t end, unsigned long flags, int merge) { int i; struct resource *res; @@ -65,20 +65,20 @@ void __devinit update_res(struct pci_root_info *info, size_t start, /* try to merge it with old one */ for (i = 0; i < info->res_num; i++) { - size_t final_start, final_end; - size_t common_start, common_end; + resource_size_t final_start, final_end; + resource_size_t common_start, common_end; res = &info->res[i]; if (res->flags != flags) continue; - common_start = max((size_t)res->start, start); - common_end = min((size_t)res->end, end); + common_start = max(res->start, start); + common_end = min(res->end, end); if (common_start > common_end + 1) continue; - final_start = min((size_t)res->start, start); - final_end = max((size_t)res->end, end); + final_start = min(res->start, start); + final_end = max(res->end, end); res->start = final_start; res->end = final_end; diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index adbc23fe82ac..374ecc5ead42 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -22,6 +22,6 @@ extern int pci_root_num; extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; extern int found_all_numa_early; -extern void update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge); +extern void update_res(struct pci_root_info *info, resource_size_t start, + resource_size_t end, unsigned long flags, int merge); #endif -- cgit v1.2.2 From 3e3da00c01d050307e753fb7b3e84aefc16da0d0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:09 -0800 Subject: x86/pci: AMD one chain system to use pci read out res Found that MSI AMD K8 based laptops hide [0x70000000, 0x80000000) RAM from e820. Enable the PCI resource read-out even on one-chain AMD systems, i.e. for all systems. -v2: use bool for found, according to Andrew Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-6-git-send-email-yinghai@kernel.org> Acked-by: Jesse Barnes Signed-off-by: H.
Peter Anvin --- arch/x86/pci/amd_bus.c | 7 ++++--- arch/x86/pci/bus_numa.c | 5 ----- arch/x86/pci/bus_numa.h | 1 - 3 files changed, 4 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 2356ea18697d..ae50b8fa0c60 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -87,11 +87,12 @@ static int __init early_fill_mp_bus_info(void) struct range range[RANGE_NUM]; u64 val; u32 address; + bool found; if (!early_pci_allowed()) return -1; - found_all_numa_early = 0; + found = false; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { u32 id; u16 device; @@ -105,12 +106,12 @@ static int __init early_fill_mp_bus_info(void) device = (id>>16) & 0xffff; if (pci_probes[i].vendor == vendor && pci_probes[i].device == device) { - found_all_numa_early = 1; + found = true; break; } } - if (!found_all_numa_early) + if (!found) return 0; pci_root_num = 0; diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 30f65cecd65f..3411687b676e 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -5,7 +5,6 @@ int pci_root_num; struct pci_root_info pci_root_info[PCI_ROOT_NR]; -int found_all_numa_early; void x86_pci_root_bus_res_quirks(struct pci_bus *b) { @@ -21,10 +20,6 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) if (!pci_root_num) return; - /* for amd, if only one root bus, don't need to do anything */ - if (pci_root_num < 2 && found_all_numa_early) - return; - for (i = 0; i < pci_root_num; i++) { if (pci_root_info[i].bus_min == b->number) break; diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index 374ecc5ead42..f63e80294f22 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -20,7 +20,6 @@ struct pci_root_info { #define PCI_ROOT_NR 4 extern int pci_root_num; extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; -extern int found_all_numa_early; extern void update_res(struct pci_root_info *info, resource_size_t start, resource_size_t end, unsigned long flags, int merge); -- cgit v1.2.2 From 97445c3b86e0b64e059b4829a7193f8e26fb5bfc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:10 -0800 Subject: x86/pci: Use u64 instead of size_t in amd_bus.c Prepare to enable it for 32bit. -v2: remove not needed cast Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-7-git-send-email-yinghai@kernel.org> Acked-by: Jesse Barnes Signed-off-by: H. 
Peter Anvin --- arch/x86/pci/amd_bus.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index ae50b8fa0c60..f06bb1b4a80a 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -82,8 +82,8 @@ static int __init early_fill_mp_bus_info(void) struct pci_root_info *info; u32 reg; struct resource *res; - size_t start; - size_t end; + u64 start; + u64 end; struct range range[RANGE_NUM]; u64 val; u32 address; @@ -173,7 +173,7 @@ static int __init early_fill_mp_bus_info(void) info = &pci_root_info[j]; printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", - node, link, (u64)start, (u64)end); + node, link, start, end); /* kernel only handle 16 bit only */ if (end > 0xffff) @@ -207,7 +207,7 @@ static int __init early_fill_mp_bus_info(void) address = MSR_K8_TOP_MEM1; rdmsrl(address, val); end = (val & 0xffffff800000ULL); - printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); + printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20); if (end < (1ULL<<32)) subtract_range(range, RANGE_NUM, 0, end - 1); @@ -246,7 +246,7 @@ static int __init early_fill_mp_bus_info(void) info = &pci_root_info[j]; printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", - node, link, (u64)start, (u64)end); + node, link, start, end); /* * some sick allocation would have range overlap with fam10h * mmconf range, so need to update start and end. @@ -272,13 +272,13 @@ static int __init early_fill_mp_bus_info(void) endx = fam10h_mmconf_start - 1; update_res(info, start, endx, IORESOURCE_MEM, 0); subtract_range(range, RANGE_NUM, start, endx); - printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); + printk(KERN_CONT " ==> [%llx, %llx]", start, endx); start = fam10h_mmconf_end + 1; changed = 1; } if (changed) { if (start <= end) { - printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); + printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end); } else { printk(KERN_CONT "%s\n", endx?"":" ==> none"); continue; @@ -301,7 +301,7 @@ static int __init early_fill_mp_bus_info(void) address = MSR_K8_TOP_MEM2; rdmsrl(address, val); end = (val & 0xffffff800000ULL); - printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); + printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20); subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1); } -- cgit v1.2.2 From 9ad3f2c7c69659c343843393944d739fec1f2e73 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:11 -0800 Subject: x86/pci: Add cap_resource() Prepare for 32bit pci root bus -v2: hpa said we should compare with (resource_size_t)~0 -v3: according to Linus to use MAX_RESOURCE instead. also need need to put related patches together -v4: according to Andrew, use min in cap_resource() Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-8-git-send-email-yinghai@kernel.org> Acked-by: Jesse Barnes Signed-off-by: H. 
Peter Anvin --- arch/x86/pci/amd_bus.c | 8 +++++--- arch/x86/pci/bus_numa.c | 4 ++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index f06bb1b4a80a..f7e13b63154e 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -201,7 +201,7 @@ static int __init early_fill_mp_bus_info(void) memset(range, 0, sizeof(range)); /* 0xfd00000000-0xffffffffff for HT */ - range[0].end = (0xfdULL<<32) - 1; + range[0].end = cap_resource((0xfdULL<<32) - 1); /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; @@ -286,7 +286,8 @@ static int __init early_fill_mp_bus_info(void) } } - update_res(info, start, end, IORESOURCE_MEM, 1); + update_res(info, cap_resource(start), cap_resource(end), + IORESOURCE_MEM, 1); subtract_range(range, RANGE_NUM, start, end); printk(KERN_CONT "\n"); } @@ -321,7 +322,8 @@ static int __init early_fill_mp_bus_info(void) if (!range[i].end) continue; - update_res(info, range[i].start, range[i].end, + update_res(info, cap_resource(range[i].start), + cap_resource(range[i].end), IORESOURCE_MEM, 1); } } diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 3411687b676e..3510778aa8bb 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -1,5 +1,6 @@ #include #include +#include #include "bus_numa.h" @@ -55,6 +56,9 @@ void __devinit update_res(struct pci_root_info *info, resource_size_t start, if (start > end) return; + if (start == MAX_RESOURCE) + return; + if (!merge) goto addit; -- cgit v1.2.2 From 284f933d45a1e60404328440910bc2651c0fb51d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:12 -0800 Subject: x86/pci: Enable pci root res read out for 32bit too Should be good for 32bit too. -v3: cast res->start -v4: according to Linus, to use %pR instead of cast Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-9-git-send-email-yinghai@kernel.org> Acked-by: Jesse Barnes Signed-off-by: H. 
Peter Anvin --- arch/x86/pci/Makefile | 3 +-- arch/x86/pci/amd_bus.c | 18 ++---------------- arch/x86/pci/bus_numa.h | 4 ++-- arch/x86/pci/i386.c | 4 ---- 4 files changed, 5 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 39fba37f702f..0b7d3e9593e1 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -14,8 +14,7 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o -obj-y += amd_bus.o -obj-$(CONFIG_X86_64) += bus_numa.o +obj-y += amd_bus.o bus_numa.o ifeq ($(CONFIG_PCI_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index f7e13b63154e..ea6072fcf3d4 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -6,9 +6,7 @@ #include -#ifdef CONFIG_X86_64 #include -#endif #include "bus_numa.h" @@ -17,8 +15,6 @@ * also get peer root bus resource for io,mmio */ -#ifdef CONFIG_X86_64 - struct pci_hostbridge_probe { u32 bus; u32 slot; @@ -339,24 +335,14 @@ static int __init early_fill_mp_bus_info(void) info->bus_min, info->bus_max, info->node, info->link); for (j = 0; j < res_num; j++) { res = &info->res[j]; - printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", - busnum, j, - (res->flags & IORESOURCE_IO)?"io port":"mmio", - res->start, res->end); + printk(KERN_DEBUG "bus: %02x index %x %pR\n", + busnum, j, res); } } return 0; } -#else /* !CONFIG_X86_64 */ - -static int __init early_fill_mp_bus_info(void) { return 0; } - -#endif /* !CONFIG_X86_64 */ - -/* common 32/64 bit code */ - #define ENABLE_CF8_EXT_CFG (1ULL << 46) static void enable_pci_io_ecs(void *unused) diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index f63e80294f22..08d8e1576240 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -1,5 +1,5 @@ -#ifdef CONFIG_X86_64 - +#ifndef __BUS_NUMA_H +#define __BUS_NUMA_H /* * sub bus (transparent) will use entres from 3 to store extra from * root, so need to make sure we have enough slot there, Should we diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 5dc9e8c63fcd..f4e8481970bd 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -257,10 +257,6 @@ void __init pcibios_resource_survey(void) */ fs_initcall(pcibios_assign_resources); -void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b) -{ -} - /* * If we set up a device for bus mastering, we need to check the latency * timer as certain crappy BIOSes forget to set it properly. -- cgit v1.2.2 From e9a0064ad03b899938059bb576615ad9ed0f27f9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:13 -0800 Subject: x86: Change range end to start+size So make interface more consistent with early_res. Later we can share some code with early_res. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-10-git-send-email-yinghai@kernel.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 32 ++++++++++++++++---------------- arch/x86/pci/amd_bus.c | 24 ++++++++++++++---------- 2 files changed, 30 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 669da09ab9a8..06130b52f012 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -78,13 +78,13 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, base = range_state[i].base_pfn; size = range_state[i].size_pfn; nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, - base, base + size - 1); + base, base + size); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); for (i = 0; i < nr_range; i++) printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } /* Take out UC ranges: */ @@ -106,11 +106,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, RANGE_NUM, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size); } if (extra_remove_size) subtract_range(range, RANGE_NUM, extra_remove_base, - extra_remove_base + extra_remove_size - 1); + extra_remove_base + extra_remove_size); if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); @@ -118,7 +118,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, if (!range[i].end) continue; printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } } @@ -128,7 +128,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } return nr_range; @@ -142,7 +142,7 @@ static unsigned long __init sum_ranges(struct range *range, int nr_range) int i; for (i = 0; i < nr_range; i++) - sum += range[i].end + 1 - range[i].start; + sum += range[i].end - range[i].start; return sum; } @@ -489,7 +489,7 @@ x86_setup_var_mtrrs(struct range *range, int nr_range, /* Write the range: */ for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, - range[i].end - range[i].start + 1); + range[i].end - range[i].start); } /* Write the last range: */ @@ -720,7 +720,7 @@ int __init mtrr_cleanup(unsigned address_bits) * and fixed mtrrs should take effect before var mtrr for it: */ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, - (1ULL<<(20 - PAGE_SHIFT)) - 1); + 1ULL<<(20 - PAGE_SHIFT)); /* Sort the ranges: */ sort_range(range, nr_range); @@ -939,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) nr_range = 0; if (mtrr_tom2) { range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); - range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; - if (highest_pfn < range[nr_range].end + 1) - highest_pfn = range[nr_range].end + 1; + range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT; + if (highest_pfn < range[nr_range].end) + highest_pfn = range[nr_range].end; nr_range++; } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); @@ -953,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { - if (range[i].end + 1 < range[i+1].start) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < range[i+1].start) + total_trim_size += 
real_trim_memory(range[i].end, range[i+1].start); } /* Check the top: */ i = nr_range - 1; - if (range[i].end + 1 < end_pfn) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < end_pfn) + total_trim_size += real_trim_memory(range[i].end, end_pfn); if (total_trim_size) { diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index ea6072fcf3d4..fc1e8fe07e5c 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -145,7 +145,7 @@ static int __init early_fill_mp_bus_info(void) def_link = (reg >> 8) & 0x03; memset(range, 0, sizeof(range)); - range[0].end = 0xffff; + add_range(range, RANGE_NUM, 0, 0, 0xffff + 1); /* io port resource */ for (i = 0; i < 4; i++) { reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); @@ -175,7 +175,7 @@ static int __init early_fill_mp_bus_info(void) if (end > 0xffff) end = 0xffff; update_res(info, start, end, IORESOURCE_IO, 1); - subtract_range(range, RANGE_NUM, start, end); + subtract_range(range, RANGE_NUM, start, end + 1); } /* add left over io port range to def node/link, [0, 0xffff] */ /* find the position */ @@ -190,14 +190,16 @@ static int __init early_fill_mp_bus_info(void) if (!range[i].end) continue; - update_res(info, range[i].start, range[i].end, + update_res(info, range[i].start, range[i].end - 1, IORESOURCE_IO, 1); } } memset(range, 0, sizeof(range)); /* 0xfd00000000-0xffffffffff for HT */ - range[0].end = cap_resource((0xfdULL<<32) - 1); + end = cap_resource((0xfdULL<<32) - 1); + end++; + add_range(range, RANGE_NUM, 0, 0, end); /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; @@ -205,14 +207,15 @@ static int __init early_fill_mp_bus_info(void) end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20); if (end < (1ULL<<32)) - subtract_range(range, RANGE_NUM, 0, end - 1); + subtract_range(range, RANGE_NUM, 0, end); /* get mmconfig */ get_pci_mmcfg_amd_fam10h_range(); /* need to take out mmconf range */ if (fam10h_mmconf_end) { printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); - subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end); + subtract_range(range, RANGE_NUM, fam10h_mmconf_start, + fam10h_mmconf_end + 1); } /* mmio resource */ @@ -267,7 +270,8 @@ static int __init early_fill_mp_bus_info(void) /* we got a hole */ endx = fam10h_mmconf_start - 1; update_res(info, start, endx, IORESOURCE_MEM, 0); - subtract_range(range, RANGE_NUM, start, endx); + subtract_range(range, RANGE_NUM, start, + endx + 1); printk(KERN_CONT " ==> [%llx, %llx]", start, endx); start = fam10h_mmconf_end + 1; changed = 1; @@ -284,7 +288,7 @@ static int __init early_fill_mp_bus_info(void) update_res(info, cap_resource(start), cap_resource(end), IORESOURCE_MEM, 1); - subtract_range(range, RANGE_NUM, start, end); + subtract_range(range, RANGE_NUM, start, end + 1); printk(KERN_CONT "\n"); } @@ -299,7 +303,7 @@ static int __init early_fill_mp_bus_info(void) rdmsrl(address, val); end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20); - subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1); + subtract_range(range, RANGE_NUM, 1ULL<<32, end); } /* @@ -319,7 +323,7 @@ static int __init early_fill_mp_bus_info(void) continue; update_res(info, cap_resource(range[i].start), - cap_resource(range[i].end), + cap_resource(range[i].end - 1), IORESOURCE_MEM, 1); } } -- cgit v1.2.2 From 79c601695870ca2a9c0ba9949a97d2be78ec07b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:14 -0800 
Subject: x86: Print out RAM buffer information So we can check that early in the bootlog. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-11-git-send-email-yinghai@kernel.org> Reviewed-by: Christoph Lameter Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a966b753e496..f406efeb4dc4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1429,6 +1429,8 @@ void __init e820_reserve_resources_late(void) end = MAX_RESOURCE_SIZE; if (start >= end) continue; + printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", + start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } -- cgit v1.2.2 From 1842f90cc98625d4d9bf8f8b927f17705ceb4e9c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:15 -0800 Subject: x86: Call early_res_to_bootmem one time Simplify setup_node_mem: don't use bootmem from other node, instead just find_e820_area in early_node_mem. This keeps the boundary between early_res and boot mem more clear, and lets us only call early_res_to_bootmem() one time instead of for all nodes. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-12-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 1 + arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 3 +-- arch/x86/mm/numa_64.c | 62 ++++++++++++++++--------------------------------- 4 files changed, 22 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3499b4fabc94..48cadbb1d28b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -967,6 +967,7 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); + early_res_to_bootmem(0, max_low_pfn<> PAGE_SHIFT, 0, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); - early_res_to_bootmem(0, end_pfn<bdata = &bootmem_node_data[nodeid]; @@ -227,11 +234,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) * of alloc_bootmem, that could clash with reserved range */ bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); - nid = phys_to_nid(nodedata_phys); - if (nid == nodeid) - bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); - else - bootmap_start = roundup(start, PAGE_SIZE); + bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); /* * SMP_CACHE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE @@ -239,18 +242,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<= end) { - /* - * only need to free it if it is from other node - * bootmem - */ - if (nid != nodeid) - free_bootmem(nodedata_phys, pgdat_size); - } + free_early(nodedata_phys, nodedata_phys + pgdat_size); node_data[nodeid] = NULL; return; } bootmap_start = __pa(bootmap); + reserve_early(bootmap_start, bootmap_start+(bootmap_pages<> PAGE_SHIFT, @@ -259,31 +257,11 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", bootmap_start, bootmap_start + bootmap_size - 1, bootmap_pages); - - free_bootmem_with_active_regions(nodeid, end); - - /* - * convert early reserve to bootmem reserve earlier - * otherwise early_node_mem could use early reserved mem - * on previous node 
- */ - early_res_to_bootmem(start, end); - - /* - * in some case early_node_mem could use alloc_bootmem - * to get range on other node, don't reserve that again - */ - if (nid != nodeid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); - else - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, - pgdat_size, BOOTMEM_DEFAULT); nid = phys_to_nid(bootmap_start); if (nid != nodeid) printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); - else - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages< Date: Wed, 10 Feb 2010 01:20:16 -0800 Subject: x86: Introduce max_early_res and early_res_count To prepare for allocating the early res array with find_e820_area(). Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-13-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f406efeb4dc4..7053f4adb8ed 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -732,14 +732,18 @@ core_initcall(e820_mark_nvs_memory); /* * Early reserved memory areas. */ -#define MAX_EARLY_RES 32 +/* + * need to make sure this one is bigger enough before + * find_e820_area could be used + */ +#define MAX_EARLY_RES_X 32 struct early_res { u64 start, end; - char name[16]; + char name[15]; char overlap_ok; }; -static struct early_res early_res[MAX_EARLY_RES] __initdata = { +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = { { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ #if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) /* @@ -753,12 +757,22 @@ static struct early_res early_res[MAX_EARLY_RES] __initdata = { {} }; +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata = +#ifdef CONFIG_X86_32 + 2 +#else + 1 +#endif + ; + static int __init find_overlapped_early(u64 start, u64 end) { int i; struct early_res *r; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { r = &early_res[i]; if (end > r->start && start < r->end) break; @@ -776,13 +790,14 @@ static void __init drop_range(int i) { int j; - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + for (j = i + 1; j < max_early_res && early_res[j].end; j++) ; memmove(&early_res[i], &early_res[i + 1], (j - 1 - i) * sizeof(struct early_res)); early_res[j - 1].end = 0; + early_res_count--; } /* @@ -801,9 +816,9 @@ static void __init drop_overlaps_that_are_ok(u64 start, u64 end) struct early_res *r; u64 lower_start, lower_end; u64 upper_start, upper_end; - char name[16]; + char name[15]; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { r = &early_res[i]; /* Continue past non-overlapping ranges */ @@ -859,7 +874,7 @@ static void __init __reserve_early(u64 start, u64 end, char *name, struct early_res *r; i = find_overlapped_early(start, end); - if (i >= MAX_EARLY_RES) + if (i >= max_early_res) panic("Too many early reservations"); r = &early_res[i]; if (r->end) @@ -872,6 +887,7 @@ static void __init __reserve_early(u64 start, u64 end, char *name, r->overlap_ok = overlap_ok; if (name) strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; } /* @@ -924,7 +940,7 @@ void __init free_early(u64 start, u64 end) i =
find_overlapped_early(start, end); r = &early_res[i]; - if (i >= MAX_EARLY_RES || r->end != end || r->start != start) + if (i >= max_early_res || r->end != end || r->start != start) panic("free_early on not reserved area: %llx-%llx!", start, end - 1); @@ -935,14 +951,15 @@ void __init early_res_to_bootmem(u64 start, u64 end) { int i, count; u64 final_start, final_end; + int idx = 0; count = 0; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) + for (i = 0; i < max_early_res && early_res[i].end; i++) count++; - printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count, start, end); - for (i = 0; i < count; i++) { + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { struct early_res *r = &early_res[i]; printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, r->start, r->end, r->name); @@ -969,7 +986,7 @@ static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) again: i = find_overlapped_early(addr, addr + size); r = &early_res[i]; - if (i < MAX_EARLY_RES && r->end) { + if (i < max_early_res && r->end) { *addrp = addr = round_up(r->end, align); changed = 1; goto again; @@ -986,7 +1003,7 @@ static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) int changed = 0; again: last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { struct early_res *r = &early_res[i]; if (last > r->start && addr < r->start) { size = r->start - addr; -- cgit v1.2.2 From 28b1c57d3c1f8df69c958f2ae7b9e4b67538ff4d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:17 -0800 Subject: x86: Dynamically increase early_res array size Use early_res_count to track the num, and use find_e820 to get a new buffer, then copy from the old to the new one. Also, clear early_res to prevent later invalid usage. -v2 _check_and_double_early_res should take new start Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-14-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7053f4adb8ed..e09c18c8f3c1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -916,6 +916,48 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } +static void __init __check_and_double_early_res(u64 start) +{ + u64 end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? 
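
(For scale: with the initial 32-slot array, max(32/8, 2) is 4, so the headroom test that follows triggers a doubling as soon as 28 or more slots are occupied, comfortably before __reserve_early() would hit its "Too many early reservations" panic.)
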
*/ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + end = max_pfn_mapped << PAGE_SHIFT; + size = sizeof(struct early_res) * max_early_res * 2; + mem = find_e820_area(start, end, size, sizeof(struct early_res)); + + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + /* * Most early reservations come here. * @@ -929,6 +971,8 @@ void __init reserve_early(u64 start, u64 end, char *name) if (start >= end) return; + __check_and_double_early_res(end); + drop_overlaps_that_are_ok(start, end); __reserve_early(start, end, name, 0); } @@ -957,6 +1001,10 @@ void __init early_res_to_bootmem(u64 start, u64 end) for (i = 0; i < max_early_res && early_res[i].end; i++) count++; + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", count - idx, max_early_res, start, end); for (i = idx; i < count; i++) { @@ -974,6 +1022,11 @@ void __init early_res_to_bootmem(u64 start, u64 end) reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; } /* Check for already reserved areas */ -- cgit v1.2.2 From cef625eef8b4cd573a9f6a17861d34226aebf6c2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:18 -0800 Subject: x86: Make early_node_mem get mem > 4 GB if possible So we could put pgdata for the node high, and later sparse vmmap will get the section nr that need. With this patch will make <4 GB ram not use a sparse vmmap. 
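
As a side note on the allocation policy this message describes, the effect of the patch can be sketched in a few lines. This is a simplified model, not the actual hunk: find_e820_area(), MAX_DMA32_PFN and PAGE_SHIFT are the symbols used throughout this series, and the real patch widens the search window in a second pass rather than simply retrying.

	/* Sketch: prefer placement above 4 GB, fall back to the full window. */
	u64 early_node_mem_sketch(u64 start, u64 end, u64 size, u64 align)
	{
		u64 hi = start, mem;

		if (hi < ((u64)MAX_DMA32_PFN << PAGE_SHIFT) &&
		    end > ((u64)MAX_DMA32_PFN << PAGE_SHIFT))
			hi = (u64)MAX_DMA32_PFN << PAGE_SHIFT;	/* try high first */

		mem = find_e820_area(hi, end, size, align);
		if (mem == -1ULL)				/* nothing above 4 GB */
			mem = find_e820_area(start, end, size, align);
		return mem;
	}
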
before this patch, will get, before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [10 - 96] [ 0.000000] free [b12 - 1000] [ 0.000000] free [359f - 38a3] [ 0.000000] free [38b5 - 3a00] [ 0.000000] free [41e01 - 42000] [ 0.000000] free [73dde - 73e00] [ 0.000000] free [73fdd - 74000] [ 0.000000] free [741dd - 74200] [ 0.000000] free [743dd - 74400] [ 0.000000] free [745dd - 74600] [ 0.000000] free [747dd - 74800] [ 0.000000] free [749dd - 74a00] [ 0.000000] free [74bdd - 74c00] [ 0.000000] free [74ddd - 74e00] [ 0.000000] free [74fdd - 75000] [ 0.000000] free [751dd - 75200] [ 0.000000] free [753dd - 75400] [ 0.000000] free [755dd - 75600] [ 0.000000] free [757dd - 75800] [ 0.000000] free [759dd - 75a00] [ 0.000000] free [75bdd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100000 - 2080000] [ 0.000000] total free 1f87170 [ 93.301474] Placing 64MB software IO TLB between ffff880075bdd000 - ffff880079bdd000 [ 93.311814] software IO TLB at phys 0x75bdd000 - 0x79bdd000 with this patch will get: before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [a - 96] [ 0.000000] free [702 - 1000] [ 0.000000] free [359f - 3600] [ 0.000000] free [37de - 3800] [ 0.000000] free [39dd - 3a00] [ 0.000000] free [3bdd - 3c00] [ 0.000000] free [3ddd - 3e00] [ 0.000000] free [3fdd - 4000] [ 0.000000] free [41dd - 4200] [ 0.000000] free [43dd - 4400] [ 0.000000] free [45dd - 4600] [ 0.000000] free [47dd - 4800] [ 0.000000] free [49dd - 4a00] [ 0.000000] free [4bdd - 4c00] [ 0.000000] free [4ddd - 4e00] [ 0.000000] free [4fdd - 5000] [ 0.000000] free [51dd - 5200] [ 0.000000] free [53dd - 5400] [ 0.000000] free [55dd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100428 - 100600] [ 0.000000] free [13ea01 - 13ec00] [ 0.000000] free [170800 - 2080000] [ 0.000000] total free 1f87170 [ 92.689485] PCI-DMA: Using software bounce buffering for IO (SWIOTLB) [ 92.699799] Placing 64MB software IO TLB between ffff8800055dd000 - ffff8800095dd000 [ 92.710916] software IO TLB at phys 0x55dd000 - 0x95dd000 so will get enough space below 4G, aka pfn 0x100000 Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-15-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3232148756ce..02f13cb99bc2 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -163,14 +163,27 @@ static void * __init early_node_mem(int nodeid, unsigned long start, unsigned long end, unsigned long size, unsigned long align) { - unsigned long mem = find_e820_area(start, end, size, align); + unsigned long mem; + /* + * put it on high as possible + * something will go with NODE_DATA + */ + if (start < (MAX_DMA_PFN< (MAX_DMA32_PFN< (MAX_DMA32_PFN< Date: Wed, 10 Feb 2010 01:20:19 -0800 Subject: x86: Only call dma32_reserve_bootmem 64bit !CONFIG_NUMA 64bit NUMA already make enough space under 4G with new early_node_mem. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-16-git-send-email-yinghai@kernel.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/pci.h | 2 ++ arch/x86/include/asm/pci_64.h | 2 -- arch/x86/kernel/pci-dma.c | 13 ++++++++++--- arch/x86/kernel/setup.c | 7 ------- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ada8c201d513..b4a00dd4eed5 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -124,6 +124,8 @@ extern void pci_iommu_alloc(void); #include "pci_64.h" #endif +void dma32_reserve_bootmem(void); + /* implement the pci_ DMA API in terms of the generic device dma_ one */ #include diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h index ae5e40f67daf..fe15cfb21b9b 100644 --- a/arch/x86/include/asm/pci_64.h +++ b/arch/x86/include/asm/pci_64.h @@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn, extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); -extern void dma32_reserve_bootmem(void); - #endif /* __KERNEL__ */ #endif /* _ASM_X86_PCI_64_H */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 75e14e21f61a..1aa966c565f9 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -65,7 +65,7 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA) static __initdata void *dma32_bootmem_ptr; static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); @@ -116,14 +116,21 @@ static void __init dma32_free_bootmem(void) dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } +#else +void __init dma32_reserve_bootmem(void) +{ +} +static void __init dma32_free_bootmem(void) +{ +} + #endif void __init pci_iommu_alloc(void) { -#ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); -#endif + if (pci_swiotlb_detect()) goto out; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 48cadbb1d28b..ea4141b48518 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -969,14 +969,7 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn, acpi, k8); early_res_to_bootmem(0, max_low_pfn< Date: Thu, 11 Feb 2010 11:50:59 -0800 Subject: x86, ptrace: regset extensions to support xstate Add the xstate regset support which helps extend the kernel ptrace and the core-dump interfaces to support AVX state etc. This regset interface is designed to support all the future state that gets supported using xsave/xrstor infrastructure. Looking at the memory layout saved by "xsave", one can't tell which states are represented in the layout: a state that is in its init state may be flagged with a '0' bit in the xsave header, so the header alone does not distinguish a state that is in init state from one that is simply not saved in the layout. Hence the xsave memory layout available through this regset interface uses the software-usable bytes [464..511] to convey which states are represented. The first 8 bytes, sw_usable_bytes[464..471], will be set to the OS-enabled xstate mask (the same 64-bit mask returned in XCR0 by xgetbv). The note NT_X86_XSTATE represents the extended state information in the core file, using the above mentioned memory layout. Signed-off-by: Suresh Siddha LKML-Reference: <20100211195614.802495327@sbs-t61.sc.intel.com> Signed-off-by: Hongjiu Lu Cc: Roland McGrath Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/i387.h | 12 +++++-- arch/x86/include/asm/user.h | 58 +++++++++++++++++++++++++++++++ arch/x86/include/asm/xsave.h | 2 ++ arch/x86/kernel/i387.c | 83 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 34 ++++++++++++++++-- arch/x86/kernel/xsave.c | 1 + 6 files changed, 186 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index ebfb8a9e11f7..da2930924501 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -33,8 +33,16 @@ extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active extern struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index 999873b22e7f..24532c7da3d6 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -1,5 +1,63 @@ +#ifndef _ASM_X86_USER_H +#define _ASM_X86_USER_H + #ifdef CONFIG_X86_32 # include "user_32.h" #else # include "user_64.h" #endif + +#include + +struct user_ymmh_regs { + /* 16 * 16 bytes for each YMMH-reg */ + __u32 ymmh_space[64]; +}; + +struct user_xsave_hdr { + __u64 xstate_bv; + __u64 reserved1[2]; + __u64 reserved2[5]; +}; + +/* + * The structure layout of user_xstateregs, used for exporting the + * extended register state through ptrace and core-dump (NT_X86_XSTATE note) + * interfaces will be same as the memory layout of xsave used by the processor + * (except for the bytes 464..511, which can be used by the software) and hence + * the size of this structure varies depending on the features supported by the + * processor and OS. The size of the structure that users need to use can be + * obtained by doing: + * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx); + * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.) + * need to use. + * + * For now, only the first 8 bytes of the software usable bytes[464..471] will + * be used and will be set to OS enabled xstate mask (which is same as the + * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump + * remotely, etc.) can use this mask as well as the mask saved in the + * xstate_hdr bytes and interpret what states the processor/OS supports + * and what states are in modified/initialized conditions for the + * particular process/thread. + * + * Also when the user modifies certain state FP/SSE/etc through the + * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * bytes[512..519] of the memory layout are updated correspondingly. + * i.e., for example when FP state is modified to a non-init state, + * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to + * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. 
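
To make the rule above concrete: a hypothetical ptrace client that rewrites the FP area of this regset image must also set the matching feature bit before pushing the buffer back. The offset and bit positions below are taken straight from the comment above; the helper itself is invented for illustration.

	#include <stdint.h>
	#include <string.h>

	/* buf holds a regset image laid out as struct user_xstateregs */
	static void mark_fp_modified(uint8_t *buf)
	{
		uint64_t xstate_bv;

		/* xsave_hdr.xstate_bv occupies bytes 512..519 of the layout */
		memcpy(&xstate_bv, buf + 512, sizeof(xstate_bv));
		xstate_bv |= 1ULL << 0;		/* bit 0: FP left its init state */
		/* xstate_bv |= 1ULL << 1;	   bit 1: likewise for SSE      */
		memcpy(buf + 512, &xstate_bv, sizeof(xstate_bv));
	}
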
+ */ +#define USER_XSTATE_FX_SW_WORDS 6 +#define USER_XSTATE_XCR0_WORD 0 + +struct user_xstateregs { + struct { + __u64 fpx_space[58]; + __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; + } i387; + struct user_xsave_hdr xsave_hdr; + struct user_ymmh_regs ymmh; + /* further processor state extensions go here */ +}; + +#endif /* _ASM_X86_USER_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 727acc152344..ddc04ccad03b 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -27,9 +27,11 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; extern struct xsave_struct *init_xstate_buf; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_cntxt_init(void); extern void xsave_init(void); +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); extern int check_for_xstate(struct i387_fxsave_struct __user *buf, void __user *fpstate, diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f2f8540a7f3d..7a8a193b5144 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -164,6 +164,11 @@ int init_fpu(struct task_struct *tsk) return 0; } +/* + * The xstateregs_active() routine is the same as the fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ int fpregs_active(struct task_struct *target, const struct user_regset *regset) { return tsk_used_math(target) ? regset->n : 0; @@ -224,6 +229,84 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, return ret; } +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + /* + * First copy the fxsave bytes 0..463. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, + offsetof(struct user_xstateregs, + i387.xstate_fx_sw)); + if (ret) + return ret; + + /* + * Copy the 48bytes defined by software. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + xstate_fx_sw_bytes, + offsetof(struct user_xstateregs, + i387.xstate_fx_sw), + offsetof(struct user_xstateregs, + xsave_hdr)); + if (ret) + return ret; + + /* + * Copy the rest of xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave.xsave_hdr, + offsetof(struct user_xstateregs, + xsave_hdr), -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct xsave_hdr_struct *xsave_hdr; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + + xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. 
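
For completeness, this is roughly how the note type introduced here is reached from userspace via the generic PTRACE_GETREGSET request (which entered the kernel after this patch). A hedged sketch, error handling trimmed; the buffer size is an assumption and must be at least the xstate_size reported by cpuid.(eax=0xd,ecx=0).ebx:

	#include <elf.h>		/* NT_X86_XSTATE */
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/uio.h>

	int print_xcr0_mask(pid_t pid)
	{
		static uint8_t buf[4096];	/* assumed >= xstate_size */
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		uint64_t mask;

		if (ptrace(PTRACE_GETREGSET, pid,
			   (void *)(uintptr_t)NT_X86_XSTATE, &iov) < 0)
			return -1;

		/* sw_usable_bytes[464..471] carry the OS-enabled xstate mask */
		memcpy(&mask, buf + 464, sizeof(mask));
		printf("OS-enabled xstate mask: %#llx\n", (unsigned long long)mask);
		return 0;
	}
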
+ */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + return ret; +} + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..16433a59b396 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -48,6 +48,7 @@ enum x86_regset { REGSET_FP, REGSET_XFP, REGSET_IOPERM64 = REGSET_XFP, + REGSET_XSTATE, REGSET_TLS, REGSET_IOPERM32, }; @@ -1584,7 +1585,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static const struct user_regset x86_64_regsets[] = { +static struct user_regset x86_64_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1597,6 +1598,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, @@ -1622,7 +1629,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static const struct user_regset x86_32_regsets[] = { +static struct user_regset x86_32_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1641,6 +1648,12 @@ static const struct user_regset x86_32_regsets[] = { .size = sizeof(u32), .align = sizeof(u32), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, @@ -1663,6 +1676,23 @@ static const struct user_regset_view user_x86_32_view = { }; #endif +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. + */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index c5ee17e8c6d9..782c3a362ec6 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void) cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; + update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); setup_xstate_init(); -- cgit v1.2.2 From 08677214e318297f228237be0042aac754f48f1d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:20 -0800 Subject: x86: Make 64 bit use early_res instead of bootmem before slab Finally we can use early_res to replace bootmem for x86_64 now. Still can use CONFIG_NO_BOOTMEM to enable it or not. 
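
What the bootmem-less path boils down to is set arithmetic over half-open [start, end) page ranges: take the node's usable RAM ranges, subtract every early reservation, and hand the remainder straight to the page allocator (get_free_all_memory_range() and subtract_early_res() in the diff below). A self-contained toy model of that subtraction, using the same half-open convention the earlier "Change range end to start+size" patch switched to:

	#include <stdio.h>

	struct range { unsigned long start, end; };	/* [start, end) */

	/* Remove [s, e) from every range, splitting a range when the
	 * reservation punches a hole in its middle. */
	static void subtract_range(struct range *r, int n,
				   unsigned long s, unsigned long e)
	{
		for (int i = 0; i < n; i++) {
			if (r[i].end <= s || r[i].start >= e)
				continue;			/* no overlap */
			if (r[i].start >= s && r[i].end <= e) {
				r[i].start = r[i].end = 0;	/* fully covered */
			} else if (r[i].start < s && r[i].end > e) {
				for (int j = 0; j < n; j++)	/* split: new upper part */
					if (j != i && !r[j].end) {
						r[j].start = e;
						r[j].end = r[i].end;
						break;
					}
				r[i].end = s;			/* keep lower part */
			} else if (r[i].start < s) {
				r[i].end = s;			/* clip the top */
			} else {
				r[i].start = e;			/* clip the bottom */
			}
		}
	}

	int main(void)
	{
		struct range free[8] = { { 0x100, 0x1000 } };	/* one RAM range */

		subtract_range(free, 8, 0x200, 0x300);		/* early reservation */
		for (int i = 0; i < 8; i++)
			if (free[i].end)
				printf("[%#lx, %#lx)\n", free[i].start, free[i].end);
		return 0;	/* prints [0x100, 0x200) and [0x300, 0x1000) */
	}
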
-v2: fix 32bit compiling about MAX_DMA32_PFN -v3: folded bug fix from LKML message below Signed-off-by: Yinghai Lu LKML-Reference: <4B747239.4070907@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 13 ++++ arch/x86/include/asm/e820.h | 6 ++ arch/x86/kernel/e820.c | 159 ++++++++++++++++++++++++++++++++++++++++---- arch/x86/kernel/setup.c | 2 + arch/x86/mm/init_64.c | 4 ++ arch/x86/mm/numa_64.c | 20 ++++-- 6 files changed, 186 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb4092568f9e..95439843cebc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -568,6 +568,19 @@ config PARAVIRT_DEBUG Enable to debug paravirt_ops internals. Specifically, BUG if a paravirt_op is missing when it is called. +config NO_BOOTMEM + default y + bool "Disable Bootmem code" + depends on X86_64 + ---help--- + Use early_res directly instead of bootmem before slab is ready. + - allocator (buddy) [generic] + - early allocator (bootmem) [generic] + - very early allocator (reserve_early*()) [x86] + - very very early allocator (early brk model) [x86] + So reduce one layer between early allocator to final allocator + + config MEMTEST bool "Memtest" ---help--- diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 761249e396fe..7d72e5fb7008 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -117,6 +117,12 @@ extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); +void reserve_early_without_check(u64 start, u64 end, char *name); +u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align); +#include +int get_free_all_memory_range(struct range **rangep, int nodeid); + extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); extern int e820_find_active_region(const struct e820entry *ei, diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e09c18c8f3c1..90a85295f332 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -977,6 +977,25 @@ void __init reserve_early(u64 start, u64 end, char *name) __reserve_early(start, end, name, 0); } +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + void __init free_early(u64 start, u64 end) { struct early_res *r; @@ -991,6 +1010,94 @@ void __init free_early(u64 start, u64 end) drop_range(i); } +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#if 1 + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if 0 + printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) { +#if 0 + printk(KERN_CONT "\n"); +#endif + continue; + } +#if 0 + printk(KERN_CONT " subtract pfn [%010llx - 
%010llx]\n", + final_start, final_end); +#endif + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; +#ifdef MAX_DMA32_PFN + if (max_pfn_mapped > MAX_DMA32_PFN) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + end = max_pfn_mapped << PAGE_SHIFT; + mem = find_e820_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else void __init early_res_to_bootmem(u64 start, u64 end) { int i, count; @@ -1028,6 +1135,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) max_early_res = 0; early_res_count = 0; } +#endif /* Check for already reserved areas */ static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) @@ -1081,6 +1189,35 @@ again: return changed; } +/* + * Find a free area with specified alignment in a specific range. + * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + /* * Find a free area with specified alignment in a specific range. 
*/ @@ -1090,24 +1227,20 @@ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + u64 addr; + u64 ei_start, ei_last; if (ei->type != E820_RAM) continue; - addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - continue; - if (last > end) + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr == -1ULL) continue; + return addr; } return -1ULL; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ea4141b48518..d49e168bda8c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -967,7 +967,9 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); +#ifndef CONFIG_NO_BOOTMEM early_res_to_bootmem(0, max_low_pfn<> PAGE_SHIFT; @@ -235,10 +237,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; + NODE_DATA(nodeid)->node_id = nodeid; NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; +#ifndef CONFIG_NO_BOOTMEM + NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; + /* * Find a place for the bootmem map * nodedata_phys could be on other nodes by alloc_bootmem, @@ -275,6 +280,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); free_bootmem_with_active_regions(nodeid, end); +#endif node_set_online(nodeid); } @@ -733,6 +739,10 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); +#ifdef CONFIG_NO_BOOTMEM + pages += free_all_memory_core_early(MAX_NUMNODES); +#endif + return pages; } -- cgit v1.2.2 From 9bdac914240759457175ac0d6529a37d2820bc4d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:22 -0800 Subject: sparsemem: Put mem map for one node together. Add vmemmap_alloc_block_buf for mem map only. It will fallback to the old way if it cannot get a block that big. Before this patch, when a node have 128g ram installed, memmap are split into two parts or more. 
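
The cure (in mm/sparse-vmemmap.c, outside this arch/x86 extract) is a bump allocator: reserve one large per-node buffer up front and carve the per-PMD vmemmap allocations out of it sequentially, falling back to the old page-at-a-time path when no buffer is set up or it runs dry. A sketch of that allocator; the fallback name matches the hunk below, the rest is simplified:

	static void *vmemmap_buf;
	static void *vmemmap_buf_end;

	void *vmemmap_alloc_block_buf(unsigned long size, int node)
	{
		void *ptr;

		if (!vmemmap_buf)			/* no per-node buffer */
			return vmemmap_alloc_block(size, node);

		/* carve the next naturally aligned chunk from the buffer */
		ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
		if (ptr + size > vmemmap_buf_end)	/* buffer exhausted */
			return vmemmap_alloc_block(size, node);

		vmemmap_buf = ptr + size;
		return ptr;
	}
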
[ 0.000000] [ffffea0000000000-ffffea003fffffff] PMD -> [ffff880100600000-ffff88013e9fffff] on node 1 [ 0.000000] [ffffea0040000000-ffffea006fffffff] PMD -> [ffff88013ec00000-ffff88016ebfffff] on node 1 [ 0.000000] [ffffea0070000000-ffffea007fffffff] PMD -> [ffff882000600000-ffff8820105fffff] on node 0 [ 0.000000] [ffffea0080000000-ffffea00bfffffff] PMD -> [ffff882010800000-ffff8820507fffff] on node 0 [ 0.000000] [ffffea00c0000000-ffffea00dfffffff] PMD -> [ffff882050a00000-ffff8820709fffff] on node 0 [ 0.000000] [ffffea00e0000000-ffffea00ffffffff] PMD -> [ffff884000600000-ffff8840205fffff] on node 2 [ 0.000000] [ffffea0100000000-ffffea013fffffff] PMD -> [ffff884020800000-ffff8840607fffff] on node 2 [ 0.000000] [ffffea0140000000-ffffea014fffffff] PMD -> [ffff884060a00000-ffff8840709fffff] on node 2 [ 0.000000] [ffffea0150000000-ffffea017fffffff] PMD -> [ffff886000600000-ffff8860305fffff] on node 3 [ 0.000000] [ffffea0180000000-ffffea01bfffffff] PMD -> [ffff886030800000-ffff8860707fffff] on node 3 [ 0.000000] [ffffea01c0000000-ffffea01ffffffff] PMD -> [ffff888000600000-ffff8880405fffff] on node 4 [ 0.000000] [ffffea0200000000-ffffea022fffffff] PMD -> [ffff888040800000-ffff8880707fffff] on node 4 [ 0.000000] [ffffea0230000000-ffffea023fffffff] PMD -> [ffff88a000600000-ffff88a0105fffff] on node 5 [ 0.000000] [ffffea0240000000-ffffea027fffffff] PMD -> [ffff88a010800000-ffff88a0507fffff] on node 5 [ 0.000000] [ffffea0280000000-ffffea029fffffff] PMD -> [ffff88a050a00000-ffff88a0709fffff] on node 5 [ 0.000000] [ffffea02a0000000-ffffea02bfffffff] PMD -> [ffff88c000600000-ffff88c0205fffff] on node 6 [ 0.000000] [ffffea02c0000000-ffffea02ffffffff] PMD -> [ffff88c020800000-ffff88c0607fffff] on node 6 [ 0.000000] [ffffea0300000000-ffffea030fffffff] PMD -> [ffff88c060a00000-ffff88c0709fffff] on node 6 [ 0.000000] [ffffea0310000000-ffffea033fffffff] PMD -> [ffff88e000600000-ffff88e0305fffff] on node 7 [ 0.000000] [ffffea0340000000-ffffea037fffffff] PMD -> [ffff88e030800000-ffff88e0707fffff] on node 7 after patch will get [ 0.000000] [ffffea0000000000-ffffea006fffffff] PMD -> [ffff880100200000-ffff88016e5fffff] on node 0 [ 0.000000] [ffffea0070000000-ffffea00dfffffff] PMD -> [ffff882000200000-ffff8820701fffff] on node 1 [ 0.000000] [ffffea00e0000000-ffffea014fffffff] PMD -> [ffff884000200000-ffff8840701fffff] on node 2 [ 0.000000] [ffffea0150000000-ffffea01bfffffff] PMD -> [ffff886000200000-ffff8860701fffff] on node 3 [ 0.000000] [ffffea01c0000000-ffffea022fffffff] PMD -> [ffff888000200000-ffff8880701fffff] on node 4 [ 0.000000] [ffffea0230000000-ffffea029fffffff] PMD -> [ffff88a000200000-ffff88a0701fffff] on node 5 [ 0.000000] [ffffea02a0000000-ffffea030fffffff] PMD -> [ffff88c000200000-ffff88c0701fffff] on node 6 [ 0.000000] [ffffea0310000000-ffffea037fffffff] PMD -> [ffff88e000200000-ffff88e0701fffff] on node 7 -v2: change buf to vmemmap_buf instead according to Ingo also add CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER according to Ingo -v3: according to Andrew, use sizeof(name) instead of hard coded 15 Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-19-git-send-email-yinghai@kernel.org> Cc: Christoph Lameter Acked-by: Christoph Lameter Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 53158b7e5d46..e9b040e1cde5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -977,7 +977,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) if (pmd_none(*pmd)) { pte_t entry; - p = vmemmap_alloc_block(PMD_SIZE, node); + p = vmemmap_alloc_block_buf(PMD_SIZE, node); if (!p) return -ENOMEM; -- cgit v1.2.2 From db8f77c889542b09457b8b97efb311343c99a75d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:23 -0800 Subject: x86: Move bios page reserve early to head32/64.c This prepares for one more cleanup of early_res.c. -v2: no need to reserve the first page in early_res, because it is already marked reserved in e820. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-20-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 22 ++-------------------- arch/x86/kernel/head32.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 90a85295f332..4004f10285d1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -743,29 +743,11 @@ struct early_res { char name[15]; char overlap_ok; }; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = { - { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, -#endif - - {} -}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; static int max_early_res __initdata = MAX_EARLY_RES_X; static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata = -#ifdef CONFIG_X86_32 - 2 -#else - 1 -#endif - ; +static int early_res_count __initdata; static int __init find_overlapped_early(u64 start, u64 end) { diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 5051b94c9069..adedeef1dedc 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,6 +29,16 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { +#ifdef CONFIG_X86_TRAMPOLINE + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, + "EX TRAMPOLINE"); +#endif + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD -- cgit v1.2.2 From a678c2be75773e112f6d656a22a7f1645c4dbd6c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:24 -0800 Subject: x86: Separate early_res related code from e820.c ... to make e820.c smaller. -v2: fix 32-bit compilation with MAX_DMA32_PFN Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-21-git-send-email-yinghai@kernel.org> Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/e820.h | 13 +- arch/x86/include/asm/early_res.h | 20 ++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 541 +-------------------------------------- arch/x86/kernel/early_res.c | 538 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 561 insertions(+), 553 deletions(-) create mode 100644 arch/x86/include/asm/early_res.h create mode 100644 arch/x86/kernel/early_res.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 7d72e5fb7008..efad699a2c22 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -109,19 +109,8 @@ static inline void early_memtest(unsigned long start, unsigned long end) extern unsigned long end_user_pfn; -extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); -extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); -extern void reserve_early(u64 start, u64 end, char *name); -extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); -extern void free_early(u64 start, u64 end); -extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); - -void reserve_early_without_check(u64 start, u64 end, char *name); -u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align); -#include -int get_free_all_memory_range(struct range **rangep, int nodeid); +#include extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h new file mode 100644 index 000000000000..2d43b166782d --- /dev/null +++ b/arch/x86/include/asm/early_res.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_EARLY_RES_H +#define _ASM_X86_EARLY_RES_H +#ifdef __KERNEL__ + +extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); +extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); +extern void reserve_early(u64 start, u64 end, char *name); +extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); +extern void free_early(u64 start, u64 end); +extern void early_res_to_bootmem(u64 start, u64 end); + +void reserve_early_without_check(u64 start, u64 end, char *name); +u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align); +#include +int get_free_all_memory_range(struct range **rangep, int nodeid); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_EARLY_RES_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09bc5a52..f5fb9f0b6277 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -38,7 +38,7 @@ obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o -obj-y += bootflag.o e820.o +obj-y += bootflag.o e820.o early_res.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4004f10285d1..82db4015604e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -12,21 +12,14 @@ #include #include #include -#include -#include -#include -#include -#include #include #include #include -#include -#include #include +#include #include #include -#include /* * The e820 map is the map that gets modified e.g. 
with command line parameters @@ -729,538 +722,6 @@ static int __init e820_mark_nvs_memory(void) core_initcall(e820_mark_nvs_memory); #endif -/* - * Early reserved memory areas. - */ -/* - * need to make sure this one is bigger enough before - * find_e820_area could be used - */ -#define MAX_EARLY_RES_X 32 - -struct early_res { - u64 start, end; - char name[15]; - char overlap_ok; -}; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; - -static int max_early_res __initdata = MAX_EARLY_RES_X; -static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < max_early_res && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; - early_res_count--; -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. - */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) -{ - int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[15]; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) - continue; - - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. 
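
A worked instance of steps 1-3 (numbers invented): an existing overlap_ok reservation [0x1000, 0x5000) hit by a new request [0x2000, 0x3000) is split as

	lower: [0x1000, 0x2000)		/* r->start < start */
	upper: [0x3000, 0x5000)		/* r->end  > end    */

the original entry is dropped, both remnants are re-added via reserve_early_overlap_ok(), and the caller can then reserve [0x2000, 0x3000) without tripping the overlap panic in __reserve_early().
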
*/ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= max_early_res) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name?name:"", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -static void __init __check_and_double_early_res(u64 start) -{ - u64 end, size, mem; - struct early_res *new; - - /* do we have enough slots left ? */ - if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) - return; - - /* double it */ - end = max_pfn_mapped << PAGE_SHIFT; - size = sizeof(struct early_res) * max_early_res * 2; - mem = find_e820_area(start, end, size, sizeof(struct early_res)); - - if (mem == -1ULL) - panic("can not find more space for early_res array"); - - new = __va(mem); - /* save the first one for own */ - new[0].start = mem; - new[0].end = mem + size; - new[0].overlap_ok = 0; - /* copy old to new */ - if (early_res == early_res_x) { - memcpy(&new[1], &early_res[0], - sizeof(struct early_res) * max_early_res); - memset(&new[max_early_res+1], 0, - sizeof(struct early_res) * (max_early_res - 1)); - early_res_count++; - } else { - memcpy(&new[1], &early_res[1], - sizeof(struct early_res) * (max_early_res - 1)); - memset(&new[max_early_res], 0, - sizeof(struct early_res) * max_early_res); - } - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = new; - max_early_res *= 2; - printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", - max_early_res, mem, mem + size - 1); -} - -/* - * Most early reservations come here. - * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. 
- */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - __check_and_double_early_res(end); - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init reserve_early_without_check(u64 start, u64 end, char *name) -{ - struct early_res *r; - - if (start >= end) - return; - - __check_and_double_early_res(end); - - r = &early_res[early_res_count]; - - r->start = start; - r->end = end; - r->overlap_ok = 0; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= max_early_res || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -#ifdef CONFIG_NO_BOOTMEM -static void __init subtract_early_res(struct range *range, int az) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - -#if 1 - printk(KERN_INFO "Subtract (%d early reservations)\n", count); -#endif - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; -#if 0 - printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i, - r->start, r->end, r->name); -#endif - final_start = PFN_DOWN(r->start); - final_end = PFN_UP(r->end); - if (final_start >= final_end) { -#if 0 - printk(KERN_CONT "\n"); -#endif - continue; - } -#if 0 - printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n", - final_start, final_end); -#endif - subtract_range(range, az, final_start, final_end); - } - -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - int i, count; - u64 start = 0, end; - u64 size; - u64 mem; - struct range *range; - int nr_range; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - count *= 2; - - size = sizeof(struct range) * count; -#ifdef MAX_DMA32_PFN - if (max_pfn_mapped > MAX_DMA32_PFN) - start = MAX_DMA32_PFN << PAGE_SHIFT; -#endif - end = max_pfn_mapped << PAGE_SHIFT; - mem = find_e820_area(start, end, size, sizeof(struct range)); - if (mem == -1ULL) - panic("can not find more space for range free"); - - range = __va(mem); - /* use early_node_map[] and early_res to get range array at first */ - memset(range, 0, size); - nr_range = 0; - - /* need to go over early_node_map to find out good range for node */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); - subtract_early_res(range, count); - nr_range = clean_sort_range(range, count); - - /* need to clear it ? 
*/ - if (nodeid == MAX_NUMNODES) { - memset(&early_res[0], 0, - sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - } - - *rangep = range; - return nr_range; -} -#else -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - - printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count - idx, max_early_res, start, end); - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } - /* clear them */ - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - early_res_count = 0; -} -#endif - -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < max_early_res && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) -{ - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < max_early_res && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - * only with the area.between start to end is active range from early_node_map - * so they are good as RAM - */ -u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - goto out; - if (last > end) - goto out; - - return addr; - -out: - return -1ULL; -} - -/* - * Find a free area with specified alignment in a specific range. 
- */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area(ei_start, ei_last, start, end, - size, align); - - if (addr == -1ULL) - continue; - - return addr; - } - return -1ULL; -} - -/* - * Find next free range after *start - */ -u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; - - if (ei->type != E820_RAM) - continue; - addr = round_up(ei->addr, align); - ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - continue; - return addr; - } - - return -1ULL; -} - /* * pre allocated 4k and reserved it in e820 */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c new file mode 100644 index 000000000000..1cf2c2f9ea68 --- /dev/null +++ b/arch/x86/kernel/early_res.c @@ -0,0 +1,538 @@ +/* + * early_res, could be used to replace bootmem + */ +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Early reserved memory areas. + */ +/* + * need to make sure this one is bigger enough before + * find_e820_area could be used + */ +#define MAX_EARLY_RES_X 32 + +struct early_res { + u64 start, end; + char name[15]; + char overlap_ok; +}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; + +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata; + +static int __init find_overlapped_early(u64 start, u64 end) +{ + int i; + struct early_res *r; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + break; + } + + return i; +} + +/* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and + * clearing what had been the last slot. + */ +static void __init drop_range(int i) +{ + int j; + + for (j = i + 1; j < max_early_res && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; + early_res_count--; +} + +/* + * Split any existing ranges that: + * 1) are marked 'overlap_ok', and + * 2) overlap with the stated range [start, end) + * into whatever portion (if any) of the existing range is entirely + * below or entirely above the stated range. Drop the portion + * of the existing range that overlaps with the stated range, + * which will allow the caller of this routine to then add that + * stated range without conflicting with any existing range. 
+ */ +static void __init drop_overlaps_that_are_ok(u64 start, u64 end) +{ + int i; + struct early_res *r; + u64 lower_start, lower_end; + u64 upper_start, upper_end; + char name[15]; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + + /* Continue past non-overlapping ranges */ + if (end <= r->start || start >= r->end) + continue; + + /* + * Leave non-ok overlaps as is; let caller + * panic "Overlapping early reservations" + * when it hits this overlap. + */ + if (!r->overlap_ok) + return; + + /* + * We have an ok overlap. We will drop it from the early + * reservation map, and add back in any non-overlapping + * portions (lower or upper) as separate, overlap_ok, + * non-overlapping ranges. + */ + + /* 1. Note any non-overlapping (lower or upper) ranges. */ + strncpy(name, r->name, sizeof(name) - 1); + + lower_start = lower_end = 0; + upper_start = upper_end = 0; + if (r->start < start) { + lower_start = r->start; + lower_end = start; + } + if (r->end > end) { + upper_start = end; + upper_end = r->end; + } + + /* 2. Drop the original ok overlapping range */ + drop_range(i); + + i--; /* resume for-loop on copied down entry */ + + /* 3. Add back in any non-overlapping ranges. */ + if (lower_end) + reserve_early_overlap_ok(lower_start, lower_end, name); + if (upper_end) + reserve_early_overlap_ok(upper_start, upper_end, name); + } +} + +static void __init __reserve_early(u64 start, u64 end, char *name, + int overlap_ok) +{ + int i; + struct early_res *r; + + i = find_overlapped_early(start, end); + if (i >= max_early_res) + panic("Too many early reservations"); + r = &early_res[i]; + if (r->end) + panic("Overlapping early reservations " + "%llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name ? name : "", r->start, + r->end - 1, r->name); + r->start = start; + r->end = end; + r->overlap_ok = overlap_ok; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +/* + * A few early reservtations come here. + * + * The 'overlap_ok' in the name of this routine does -not- mean it + * is ok for these reservations to overlap an earlier reservation. + * Rather it means that it is ok for subsequent reservations to + * overlap this one. + * + * Use this entry point to reserve early ranges when you are doing + * so out of "Paranoia", reserving perhaps more memory than you need, + * just in case, and don't mind a subsequent overlapping reservation + * that is known to be needed. + * + * The drop_overlaps_that_are_ok() call here isn't really needed. + * It would be needed if we had two colliding 'overlap_ok' + * reservations, so that the second such would not panic on the + * overlap with the first. We don't have any such as of this + * writing, but might as well tolerate such if it happens in + * the future. + */ +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) +{ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 1); +} + +static void __init __check_and_double_early_res(u64 start) +{ + u64 end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? 
*/ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + end = max_pfn_mapped << PAGE_SHIFT; + size = sizeof(struct early_res) * max_early_res * 2; + mem = find_e820_area(start, end, size, sizeof(struct early_res)); + + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + +/* + * Most early reservations come here. + * + * We first have drop_overlaps_that_are_ok() drop any pre-existing + * 'overlap_ok' ranges, so that we can then reserve this memory + * range without risk of panic'ing on an overlapping overlap_ok + * early reservation. + */ +void __init reserve_early(u64 start, u64 end, char *name) +{ + if (start >= end) + return; + + __check_and_double_early_res(end); + + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); +} + +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i; + + i = find_overlapped_early(start, end); + r = &early_res[i]; + if (i >= max_early_res || r->end != end || r->start != start) + panic("free_early on not reserved area: %llx-%llx!", + start, end - 1); + + drop_range(i); +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#define DEBUG_PRINT_EARLY_RES 1 + +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) + continue; + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; +#ifdef MAX_DMA32_PFN + if (max_pfn_mapped > MAX_DMA32_PFN) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + end = max_pfn_mapped << PAGE_SHIFT; 
+ mem = find_e820_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, + r->start, r->end, r->name); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) { + printk(KERN_CONT "\n"); + continue; + } + printk(KERN_CONT " ==> [%010llx - %010llx]\n", + final_start, final_end); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); + } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; +} +#endif + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp; + int changed = 0; + struct early_res *r; +again: + i = find_overlapped_early(addr, addr + size); + r = &early_res[i]; + if (i < max_early_res && r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < max_early_res && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. 
+ * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr == -1ULL) + continue; + + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && + addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + continue; + return addr; + } + + return -1ULL; +} -- cgit v1.2.2 From 7da657d1f1dd27fa9d8289d5f7e53479c7fd3a95 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:25 -0800 Subject: x86: Add find_early_area_size Prepare to move find_e820_area_size back to e820.c. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-22-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/early_res.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 1cf2c2f9ea68..bfa1ba705d48 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -476,6 +476,29 @@ out: return -1ULL; } +u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + goto out; + + return addr; + +out: + return -1ULL; +} + /* * Find a free area with specified alignment in a specific range.
*/ @@ -513,24 +536,20 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + u64 addr; + u64 ei_start, ei_last; if (ei->type != E820_RAM) continue; - addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) + ei_start = ei->addr; + addr = find_early_area_size(ei_start, ei_last, start, + sizep, align); + + if (addr == -1ULL) continue; + return addr; } -- cgit v1.2.2 From efdd0e81df0f23830c6d2cb971cf87f415b8dbdb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:26 -0800 Subject: x86: Move back find_e820_area to e820.c Makes early_res.c more clean, so later could move it to /kernel. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-23-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/e820.h | 2 ++ arch/x86/include/asm/early_res.h | 4 +-- arch/x86/kernel/e820.c | 53 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/early_res.c | 57 ---------------------------------------- 4 files changed, 57 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index efad699a2c22..a8299e134437 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -109,6 +109,8 @@ static inline void early_memtest(unsigned long start, unsigned long end) extern unsigned long end_user_pfn; +extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); +extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); #include diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h index 2d43b166782d..5a4d2eb8e79c 100644 --- a/arch/x86/include/asm/early_res.h +++ b/arch/x86/include/asm/early_res.h @@ -2,8 +2,6 @@ #define _ASM_X86_EARLY_RES_H #ifdef __KERNEL__ -extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); -extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); @@ -12,6 +10,8 @@ extern void early_res_to_bootmem(u64 start, u64 end); void reserve_early_without_check(u64 start, u64 end, char *name); u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, u64 size, u64 align); +u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align); #include int get_free_all_memory_range(struct range **rangep, int nodeid); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 82db4015604e..b4e512b03aa7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -722,6 +722,59 @@ static int __init e820_mark_nvs_memory(void) core_initcall(e820_mark_nvs_memory); #endif +/* + * Find a free area with specified alignment in a specific range. 
+ */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr != -1ULL) + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area_size(ei_start, ei_last, start, + sizep, align); + + if (addr != -1ULL) + return addr; + } + + return -1ULL; +} + /* * pre allocated 4k and reserved it in e820 */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index bfa1ba705d48..1b99a2619f9f 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -498,60 +498,3 @@ u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, out: return -1ULL; } - -/* - * Find a free area with specified alignment in a specific range. - */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area(ei_start, ei_last, start, end, - size, align); - - if (addr == -1ULL) - continue; - - return addr; - } - return -1ULL; -} - -/* - * Find next free range after *start - */ -u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area_size(ei_start, ei_last, start, - sizep, align); - - if (addr == -1ULL) - continue; - - return addr; - } - - return -1ULL; -} -- cgit v1.2.2 From 53db62a2529280ff216c941d8a2650204547e44a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:27 -0800 Subject: early_res: Enhance check_and_double_early_res ... to make it always try to start from low at first. This makes it less likely for early_memtest to reserve a bad range, in particular it puts new early_res in a range that is already tested. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-24-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/early_res.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 1b99a2619f9f..dbf08bd01252 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -180,9 +180,9 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } -static void __init __check_and_double_early_res(u64 start) +static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) { - u64 end, size, mem; + u64 start, end, size, mem; struct early_res *new; /* do we have enough slots left ? 
*/ @@ -190,10 +190,23 @@ static void __init __check_and_double_early_res(u64 start) return; /* double it */ - end = max_pfn_mapped << PAGE_SHIFT; + mem = -1ULL; size = sizeof(struct early_res) * max_early_res * 2; - mem = find_e820_area(start, end, size, sizeof(struct early_res)); - + if (early_res == early_res_x) + start = 0; + else + start = early_res[0].end; + end = ex_start; + if (start + size < end) + mem = find_e820_area(start, end, size, + sizeof(struct early_res)); + if (mem == -1ULL) { + start = ex_end; + end = max_pfn_mapped << PAGE_SHIFT; + if (start + size < end) + mem = find_e820_area(start, end, size, + sizeof(struct early_res)); + } if (mem == -1ULL) panic("can not find more space for early_res array"); @@ -235,7 +248,7 @@ void __init reserve_early(u64 start, u64 end, char *name) if (start >= end) return; - __check_and_double_early_res(end); + __check_and_double_early_res(start, end); drop_overlaps_that_are_ok(start, end); __reserve_early(start, end, name, 0); @@ -248,7 +261,7 @@ void __init reserve_early_without_check(u64 start, u64 end, char *name) if (start >= end) return; - __check_and_double_early_res(end); + __check_and_double_early_res(start, end); r = &early_res[early_res_count]; -- cgit v1.2.2 From 59be5a8e8ce765cf739ec7f07176219972de7481 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:28 -0800 Subject: x86: Make 32bit support NO_BOOTMEM Let's make 32bit consistent with 64bit. -v2: Andrew pointed out for 32bit that we should use -1ULL Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-25-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 1 - arch/x86/kernel/early_res.c | 3 +++ arch/x86/mm/init_32.c | 6 ++++++ arch/x86/mm/numa_32.c | 3 +++ 4 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 95439843cebc..29f9efb74fc7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -571,7 +571,6 @@ config PARAVIRT_DEBUG config NO_BOOTMEM default y bool "Disable Bootmem code" - depends on X86_64 ---help--- Use early_res directly instead of bootmem before slab is ready. 
- allocator (buddy) [generic] diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index dbf08bd01252..656cdf86a2fa 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -354,6 +354,9 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) /* need to go over early_node_map to find out good range for node */ nr_range = add_from_early_node_map(range, count, nr_range, nodeid); +#ifdef CONFIG_X86_32 + subtract_range(range, count, max_low_pfn, -1ULL); +#endif subtract_early_res(range, count); nr_range = clean_sort_range(range, count); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2dccde06d22f..262867a7d438 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -748,6 +748,7 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +#ifndef CONFIG_NO_BOOTMEM static unsigned long __init setup_node_bootmem(int nodeid, unsigned long start_pfn, unsigned long end_pfn, @@ -767,9 +768,11 @@ static unsigned long __init setup_node_bootmem(int nodeid, return bootmap + bootmap_size; } +#endif void __init setup_bootmem_allocator(void) { +#ifndef CONFIG_NO_BOOTMEM int nodeid; unsigned long bootmap_size, bootmap; /* @@ -781,11 +784,13 @@ void __init setup_bootmem_allocator(void) if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); +#endif printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<node_id = nid; +#ifndef CONFIG_NO_BOOTMEM NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; +#endif } setup_bootmem_allocator(); -- cgit v1.2.2 From 9b3be9f99203d9a400e8547f0e80f1d8f8e5738c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:29 -0800 Subject: Move round_up/down to kernel.h ... in preparation of moving early_res to kernel/. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-26-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/proto.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 4009f6534f52..6f414ed88620 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -23,14 +23,4 @@ extern int reboot_force; long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); -/* - * This looks more complex than it should be. But we need to - * get the type for the ~ right in round_down (it needs to be - * as wide as the result!), and we want to evaluate the macro - * arguments just once each. - */ -#define __round_mask(x,y) ((__typeof__(x))((y)-1)) -#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1) -#define round_down(x,y) ((x) & ~__round_mask(x,y)) - #endif /* _ASM_X86_PROTO_H */ -- cgit v1.2.2 From dd645cee7b50b61cb2d05b59eb6027679c437af6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:30 -0800 Subject: x86: Add find_fw_memmap_area ... so we can move early_res up. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-27-git-send-email-yinghai@kernel.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/early_res.h | 1 + arch/x86/kernel/e820.c | 4 ++++ arch/x86/kernel/early_res.c | 17 +++++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h index 5a4d2eb8e79c..9758f3df9dad 100644 --- a/arch/x86/include/asm/early_res.h +++ b/arch/x86/include/asm/early_res.h @@ -12,6 +12,7 @@ u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, u64 size, u64 align); u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start, u64 *sizep, u64 align); +u64 find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align); #include int get_free_all_memory_range(struct range **rangep, int nodeid); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b4e512b03aa7..36918d8463ab 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -748,6 +748,10 @@ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) return -1ULL; } +u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) +{ + return find_e820_area(start, end, size, align); +} /* * Find next free range after *start */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 656cdf86a2fa..1458dc022343 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -7,16 +7,14 @@ #include #include -#include #include -#include /* * Early reserved memory areas. */ /* * need to make sure this one is bigger enough before - * find_e820_area could be used + * find_fw_memmap_area could be used */ #define MAX_EARLY_RES_X 32 @@ -180,6 +178,13 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } +u64 __init __weak find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) +{ + panic("should have find_fw_memmap_area defined with arch"); + + return -1ULL; +} + static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) { u64 start, end, size, mem; @@ -198,13 +203,13 @@ static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) start = early_res[0].end; end = ex_start; if (start + size < end) - mem = find_e820_area(start, end, size, + mem = find_fw_memmap_area(start, end, size, sizeof(struct early_res)); if (mem == -1ULL) { start = ex_end; end = max_pfn_mapped << PAGE_SHIFT; if (start + size < end) - mem = find_e820_area(start, end, size, + mem = find_fw_memmap_area(start, end, size, sizeof(struct early_res)); } if (mem == -1ULL) @@ -343,7 +348,7 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) start = MAX_DMA32_PFN << PAGE_SHIFT; #endif end = max_pfn_mapped << PAGE_SHIFT; - mem = find_e820_area(start, end, size, sizeof(struct range)); + mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); if (mem == -1ULL) panic("can not find more space for range free"); -- cgit v1.2.2 From 0d1622d7f526311d87d7da2ee7dd14b73e45d3fc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 13 Feb 2010 10:33:12 +0200 Subject: x86-64, rwsem: Avoid store forwarding hazard in __downgrade_write The Intel Architecture Optimization Reference Manual states that a short load that follows a long store to the same object will suffer a store forwarding penalty, particularly if the two accesses use different addresses. Trivially, a long load that follows a short store will also suffer a penalty.
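For illustration, a minimal hypothetical C sketch of the hazard being described (the struct and function are invented for this example and are not part of the patch; the cast also violates strict aliasing in portable C):

	/*
	 * A short store followed by a long load of the same object: the
	 * 64-bit load overlaps, but is not fully covered by, the 32-bit
	 * store, so store-to-load forwarding fails and the load stalls
	 * until the store retires to the cache.
	 */
	struct counter {
		int low;		/* updated with a short (32-bit) store */
		int high;
	};

	static long read_after_bump(struct counter *c)
	{
		c->low++;		/* short store ... */
		return *(long *)c;	/* ... long load: penalized */
	}

The short-load-after-long-store direction is the same problem in reverse, and per the manual text quoted above it bites especially when the load uses a different address within the freshly stored object.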
__downgrade_write() in rwsem incurs both penalties: the increment operation will not be able to reuse a recently-loaded rwsem value, and its result will not be reused by any recently-following rwsem operation. A comment in the code states that this is because 64-bit immediates are special and expensive; but while they are slightly special (only a single instruction allows them), they aren't expensive: a test shows that two loops, one loading a 32-bit immediate and one loading a 64-bit immediate, both take 1.5 cycles per iteration. Fix this by changing __downgrade_write to use the same add instruction on i386 and on x86_64, so that it uses the same operand size as all the other rwsem functions. Signed-off-by: Avi Kivity LKML-Reference: <1266049992-17419-1-git-send-email-avi@redhat.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/rwsem.h | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 10204a25bf93..606ede126972 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -232,34 +232,19 @@ static inline void __up_write(struct rw_semaphore *sem) */ static inline void __downgrade_write(struct rw_semaphore *sem) { -#ifdef CONFIG_X86_64 -# if RWSEM_WAITING_BIAS != -0x100000000 -# error "This code assumes RWSEM_WAITING_BIAS == -2^32" -# endif - - /* 64-bit immediates are special and expensive, and not needed here */ - asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX "incl 4(%1)\n\t" - /* transitions 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 */ - " jns 1f\n\t" - " call call_rwsem_downgrade_wake\n" - "1:\n\t" - "# ending __downgrade_write\n" - : "+m" (sem->count) - : "a" (sem) - : "memory", "cc"); -#else asm volatile("# beginning __downgrade_write\n\t" LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" - /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ + /* + * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) + * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) + */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" "1:\n\t" "# ending __downgrade_write\n" : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_WAITING_BIAS) + : "a" (sem), "er" (-RWSEM_WAITING_BIAS) : "memory", "cc"); -#endif } /* -- cgit v1.2.2 From 414bb144efa2d2fe16d104d836d0d6b6e9265788 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Dec 2009 13:08:41 +0100 Subject: x86, cpu: Print AMD virtualization features in /proc/cpuinfo This patch adds code to cpu initialization path to detect the extended virtualization features of AMD cpus to show them in /proc/cpuinfo. Signed-off-by: Joerg Roedel LKML-Reference: <1260792521-15212-1-git-send-email-joerg.roedel@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 4 ++++ arch/x86/kernel/cpu/addon_cpuid_features.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 637e1ec963c3..0cd82d068613 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -168,6 +168,10 @@ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 468489b57aae..97ad79cdf688 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, { 0, 0, 0, 0 } }; -- cgit v1.2.2 From 68fd111e02b979876359c7b471a8bcbca0628b75 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:25 -0800 Subject: x86, numa: Fix numa emulation calculation of big nodes numa=fake=N uses split_nodes_interleave() to partition the system into N fake nodes. Each node size must be a multiple of FAKE_NODE_MIN_SIZE, otherwise it is possible to get strange alignments. Because of this, the remaining memory from each node when rounded to FAKE_NODE_MIN_SIZE is consolidated into a number of "big nodes" that are bigger than the rest. The calculation of the number of big nodes is incorrect since it is using a logical AND operator when it should be multiplying the rounded-off portion of each node by N. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 83bbc70d11bb..2ecbe0ca0dfc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -427,7 +427,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder. */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / FAKE_NODE_MIN_SIZE; size &= FAKE_NODE_MIN_HASH_MASK; -- cgit v1.2.2 From 8df5bb34defd685fe86f60746bbf3d47d1c6f033 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:30 -0800 Subject: x86, numa: Add fixed node size option for numa emulation numa=fake=N specifies the number of fake nodes, N, to partition the system into and then allocates them by interleaving over physical nodes.
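For example (an illustrative kernel command-line value, not taken from the patch):

	numa=fake=32

on a 64GB machine partitions RAM into 32 fake nodes of roughly 2GB each, interleaved over the physical nodes.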
This requires knowledge of the system capacity when attempting to allocate nodes of a certain size: either very large nodes to benchmark scalability of code that operates on individual nodes, or very small nodes to find bugs in the VM. This patch introduces numa=fake=[MG] so it is possible to specify the size of each node to allocate. When used, nodes of the size specified will be allocated and interleaved over the set of physical nodes. FAKE_NODE_MIN_SIZE was also moved to the more-appropriate include/asm/numa_64.h. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mmzone_64.h | 6 -- arch/x86/include/asm/numa_64.h | 5 ++ arch/x86/mm/numa_64.c | 117 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 114 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a29f48c2a322..288b96f815a6 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -#endif - #endif #endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index c4ae822e415f..823e070e7c26 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); + +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +#endif /* CONFIG_NUMA_EMU */ #else static inline void init_cpu_to_node(void) { } static inline void numa_set_node(int cpu, int node) { } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2ecbe0ca0dfc..c47c78ba3aca 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -501,6 +501,102 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, return ret; } +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - e820_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. + */ +static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int ret = 0; + int i; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). 
+ */ + min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / + MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < MAX_NUMNODES; i++) + if (physnodes[i].start != physnodes[i].end) + node_set(i, physnode_mask); + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 end; + + end = find_end_of_node(physnodes[i].start, + physnodes[i].end, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (physnodes[i].end - end - + e820_hole_size(end, physnodes[i].end) < size) + end = physnodes[i].end; + + /* + * Setup the fake node that will be allocated as bootmem + * later. If setup_node_range() returns non-zero, there + * is no more memory available on this physical node. + */ + if (setup_node_range(ret++, &physnodes[i].start, + end - physnodes[i].start, + physnodes[i].end) < 0) + node_clear(i, physnode_mask); + } + } + return ret; +} + /* * Splits num_nodes nodes up equally starting at node_start. The return value * is the number of nodes split up and addr is adjusted to be at the end of the @@ -546,14 +642,7 @@ static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, if (i == num_nodes + node_start - 1) end = max_addr; else - while (end - *addr - e820_hole_size(*addr, end) < - size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } + end = find_end_of_node(*addr, max_addr, size); if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; } @@ -588,6 +677,18 @@ static int __init numa_emulation(unsigned long start_pfn, int num_phys_nodes; num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. + */ + if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + size = memparse(cmdline, &cmdline); + num_nodes = split_nodes_size_interleave(addr, max_addr, size); + if (num_nodes < 0) + return num_nodes; + goto out; + } + /* * If the numa=fake command-line is just a single number N, split the * system RAM into N fake nodes. -- cgit v1.2.2 From ca2107c9d6cf44fb915402d6f12b9d9ff3925cd7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:33 -0800 Subject: x86, numa: Remove configurable node size support for numa emulation Now that numa=fake=[MG] is implemented, it is possible to remove configurable node size support. The command-line parsing was already broken (numa=fake=*128, for example, would not work) and since fake nodes are now interleaved over physical nodes, this support is no longer required. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_64.c | 160 ++++---------------------------------------------- 1 file changed, 13 insertions(+), 147 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c47c78ba3aca..3307ea8bd43a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -597,73 +597,6 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) return ret; } -/* - * Splits num_nodes nodes up equally starting at node_start. The return value - * is the number of nodes split up and addr is adjusted to be at the end of the - * last node allocated. - */ -static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, - int num_nodes) -{ - unsigned int big; - u64 size; - int i; - - if (num_nodes <= 0) - return -1; - if (num_nodes > MAX_NUMNODES) - num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / - num_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the leftovers. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / - FAKE_NODE_MIN_SIZE; - - /* Round down to nearest FAKE_NODE_MIN_SIZE. */ - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - printk(KERN_ERR "Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = node_start; i < num_nodes + node_start; i++) { - u64 end = *addr + size; - - if (i < big) - end += FAKE_NODE_MIN_SIZE; - /* - * The final node can have the remaining system RAM. Other - * nodes receive roughly the same amount of available pages. - */ - if (i == num_nodes + node_start - 1) - end = max_addr; - else - end = find_end_of_node(*addr, max_addr, size); - if (setup_node_range(i, addr, end - *addr, max_addr) < 0) - break; - } - return i - node_start + 1; -} - -/* - * Splits the remaining system RAM into chunks of size. The remaining memory is - * always assigned to a final node and can be asymmetric. Returns the number of - * nodes split. - */ -static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, - u64 size) -{ - int i = node_start; - size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, addr, size, max_addr)) - ; - return i - node_start; -} - /* * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. @@ -671,99 +604,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn, int acpi, int k8) { - u64 size, addr = start_pfn << PAGE_SHIFT; + u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; - int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; int num_phys_nodes; + int num_nodes; + int i; num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); /* * If the numa=fake command-line contains a 'M' or 'G', it represents - * the fixed node size. + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. */ if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + u64 size; + size = memparse(cmdline, &cmdline); num_nodes = split_nodes_size_interleave(addr, max_addr, size); - if (num_nodes < 0) - return num_nodes; - goto out; - } + } else { + unsigned long n; - /* - * If the numa=fake command-line is just a single number N, split the - * system RAM into N fake nodes. 
- */ - if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - long n = simple_strtol(cmdline, NULL, 0); - - num_nodes = split_nodes_interleave(addr, max_addr, - num_phys_nodes, n); - if (num_nodes < 0) - return num_nodes; - goto out; + n = simple_strtoul(cmdline, NULL, 0); + num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); } - /* Parse the command line. */ - for (coeff_flag = 0; ; cmdline++) { - if (*cmdline && isdigit(*cmdline)) { - num = num * 10 + *cmdline - '0'; - continue; - } - if (*cmdline == '*') { - if (num > 0) - coeff = num; - coeff_flag = 1; - } - if (!*cmdline || *cmdline == ',') { - if (!coeff_flag) - coeff = 1; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - * Command-line coefficients are in megabytes. - */ - size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) - for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, &addr, - size, max_addr) < 0) - goto done; - if (!*cmdline) - break; - coeff_flag = 0; - coeff = -1; - } - num = 0; - } -done: - if (!num_nodes) - return -1; - /* Fill remainder of system RAM, if appropriate. */ - if (addr < max_addr) { - if (coeff_flag && coeff < 0) { - /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(&addr, max_addr, - num_nodes, num); - goto out; - } - switch (*(cmdline - 1)) { - case '*': - /* Split remaining nodes into coeff chunks */ - if (coeff <= 0) - break; - num_nodes += split_nodes_equally(&addr, max_addr, - num_nodes, coeff); - break; - case ',': - /* Do not allocate remaining system RAM */ - break; - default: - /* Give one final node */ - setup_node_range(num_nodes, &addr, max_addr - addr, - max_addr); - num_nodes++; - } - } -out: + if (num_nodes < 0) + return num_nodes; memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); if (memnode_shift < 0) { memnode_shift = 0; -- cgit v1.2.2 From 942fa3b63eb525aa0512ba28c42e656d8efc6787 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 8 Feb 2010 10:03:17 +0000 Subject: x86, mtrr: Kill over the top warn Fixes bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=12558 Fixes bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=12317 (and if this really needed to be a warn you'd be responding to the bugs left in bugzilla from it...) Signed-off-by: Alan Cox LKML-Reference: <20100208100239.2568.2940.stgit@localhost.localdomain> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 4d755846fee6..163e59e272d5 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -464,7 +464,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); mask_lo = tmp; } } -- cgit v1.2.2 From 97c169d39b6846a564dc8d883832e7fef9bdb77d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 16 Feb 2010 03:30:06 -0500 Subject: ACPI: remove Asus P2B-DS from acpi=ht blacklist We realized when we broke acpi=ht http://bugzilla.kernel.org/show_bug.cgi?id=14886 that acpi=ht is not needed on this box and folks have been using acpi=force on it anyway. 
Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdfa5ca4..af1c5833ff23 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1342,14 +1342,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), }, }, - { - .callback = force_acpi_ht, - .ident = "ASUS P2B-DS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), - }, - }, { .callback = force_acpi_ht, .ident = "ASUS CUR-DLS", -- cgit v1.2.2 From 17c0e7107bed3d578864e6519f7f4e4c324c8f58 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:25 -0500 Subject: x86: Mark atomic irq ops raw for 32bit legacy The atomic ops emulation for 32bit legacy CPUs floods the tracer with irq off/on entries. The irq disabled regions are short and therefore not interesting when chasing long irq disabled latencies. Mark them raw and keep them out of the trace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 8baaa719fa7f..8f8217b9bdac 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -187,10 +187,10 @@ static inline int atomic_add_return(int i, atomic_t *v) #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); + raw_local_irq_save(flags); __i = atomic_read(v); atomic_set(v, i + __i); - local_irq_restore(flags); + raw_local_irq_restore(flags); return i + __i; #endif } -- cgit v1.2.2 From dade7716925a4e9a31f249f9ca1ed4e2f1495a8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 18:39:36 +0200 Subject: x86: Convert ioapic_lock and vector_lock to raw_spinlock Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 106 ++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 937150e4c06d..d55e43d352b3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -73,8 +73,8 @@ */ int sis_apic_bug = -1; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -393,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) struct irq_pin_list *entry; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; @@ -402,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? 
*/ if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return false; } @@ -420,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return eu.entry; } @@ -446,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -461,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin) unsigned long flags; union entry_union eu = { .entry.mask = 1 }; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -591,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) BUG_ON(!cfg); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __mask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) @@ -601,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) struct irq_cfg *cfg = desc->chip_data; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void mask_IO_APIC_irq(unsigned int irq) @@ -1127,12 +1127,12 @@ void lock_vector_lock(void) /* Used to the online set of cpus does not change * during assign_irq_vector. */ - spin_lock(&vector_lock); + raw_spin_lock(&vector_lock); } void unlock_vector_lock(void) { - spin_unlock(&vector_lock); + raw_spin_unlock(&vector_lock); } static int @@ -1220,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int err; unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); err = __assign_irq_vector(irq, cfg, mask); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } @@ -1265,7 +1265,7 @@ void __setup_vector_irq(int cpu) * assignments that might be happening on another cpu in parallel, * while we setup our initial vector to irq mappings. 
*/ - spin_lock(&vector_lock); + raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; @@ -1284,7 +1284,7 @@ void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } - spin_unlock(&vector_lock); + raw_spin_unlock(&vector_lock); } static struct irq_chip ioapic_chip; @@ -1603,14 +1603,14 @@ __apicdebuginit(void) print_IO_APIC(void) for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); if (reg_01.bits.version >= 0x20) reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); @@ -1905,9 +1905,9 @@ void __init enable_IO_APIC(void) * The number of IO-APIC IRQ registers (== #pins): */ for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } @@ -2047,9 +2047,9 @@ void __init setup_ioapic_ids_from_mpc(void) for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); old_id = mp_ioapics[apic_id].apicid; @@ -2108,16 +2108,16 @@ void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid); reg_00.bits.ID = mp_ioapics[apic_id].apicid; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) printk("could not set ID!\n"); else @@ -2200,7 +2200,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) unsigned long flags; struct irq_cfg *cfg; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) @@ -2208,7 +2208,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) } cfg = irq_cfg(irq); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } @@ -2219,9 +2219,9 @@ static int ioapic_retrigger_irq(unsigned int irq) struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; } @@ -2314,14 +2314,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) irq 
= desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); ret = set_desc_affinity(desc, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -2549,9 +2549,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ack_apic_level(unsigned int irq) @@ -3133,13 +3133,13 @@ static int ioapic_resume(struct sys_device *dev) data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { reg_00.bits.ID = mp_ioapics[dev->id].apicid; io_apic_write(dev->id, 0, reg_00.raw); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ioapic_write_entry(dev->id, i, entry[i]); @@ -3202,7 +3202,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (irq_want < nr_irqs_gsi) irq_want = nr_irqs_gsi; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { @@ -3221,7 +3221,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) irq = new; break; } - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); if (irq > 0) { dynamic_irq_init(irq); @@ -3261,9 +3261,9 @@ void destroy_irq(unsigned int irq) desc->chip_data = cfg; free_irte(irq); - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); } /* @@ -3800,9 +3800,9 @@ int __init io_apic_get_redir_entries (int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.entries; } @@ -3964,9 +3964,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (physids_empty(apic_id_map)) apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (apic_id >= get_physical_broadcast()) { printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " @@ -4000,10 +4000,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0, reg_00.raw); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* 
Sanity check */ if (reg_00.bits.ID != apic_id) { @@ -4024,9 +4024,9 @@ int __init io_apic_get_version(int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } -- cgit v1.2.2 From 1252f238db48ec419f40c1bdf30fda649860eed9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 16 Feb 2010 15:02:13 +0100 Subject: x86: set_personality_ia32() misses force_personality32 05d43ed8a "x86: get rid of the insane TIF_ABI_PENDING bit" forgot about force_personality32. Fix. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- arch/x86/kernel/process_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 41a26a82470a..126f0b493d04 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -527,6 +527,7 @@ void set_personality_ia32(void) /* Make sure to be in 32bit mode */ set_thread_flag(TIF_IA32); + current->personality |= force_personality32; /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; -- cgit v1.2.2 From 11557b24fdec13cb1c3d5f681688401a651ed54e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 16 Feb 2010 15:24:01 +0100 Subject: x86: ELF_PLAT_INIT() shouldn't worry about TIF_IA32 The 64-bit version of ELF_PLAT_INIT() clears TIF_IA32, but at this point it has already been cleared by SET_PERSONALITY == set_personality_64bit. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- arch/x86/include/asm/elf.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1994d3f58443..f2ad2163109d 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -170,10 +170,7 @@ static inline void elf_common_init(struct thread_struct *t, } #define ELF_PLAT_INIT(_r, load_addr) \ -do { \ - elf_common_init(¤t->thread, _r, 0); \ - clear_thread_flag(TIF_IA32); \ -} while (0) + elf_common_init(¤t->thread, _r, 0) #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) -- cgit v1.2.2 From 40d6753e78a602bdf62e7741c0caa36474882f00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 18:33:11 +0200 Subject: x86: Convert set_atomicity_lock to raw_spinlock Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/generic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 163e59e272d5..9aa5dc76ff4a 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -570,7 +570,7 @@ static unsigned long set_mtrr_state(void) static unsigned long cr4; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts, @@ -590,7 +590,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) * changes to the way the kernel boots */ - spin_lock(&set_atomicity_lock); + raw_spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. 
*/ cr0 = read_cr0() | X86_CR0_CD; @@ -627,7 +627,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(cr4); - spin_unlock(&set_atomicity_lock); + raw_spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) -- cgit v1.2.2 From 0fdc7a8022c3eaff6b5ee27ffb9e913e5e58d8e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 16:49:55 +0200 Subject: x86: Convert nmi_lock to raw_spinlock nmi_lock must be a spinning spinlock in -rt. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/nmi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396cb..24e7742d633a 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -416,13 +416,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) /* We can be called before check_nmi_watchdog, hence NULL check. */ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - spin_lock(&lock); + raw_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); dump_stack(); - spin_unlock(&lock); + raw_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); rc = 1; -- cgit v1.2.2 From 5619c28061ff9d2559a93eaba492935530f2a513 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 18:35:11 +0200 Subject: x86: Convert i8259_lock to raw_spinlock Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/i8259.h | 2 +- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/i8259.c | 30 +++++++++++++++--------------- arch/x86/kernel/time.c | 4 ++-- arch/x86/kernel/visws_quirks.c | 6 +++--- 5 files changed, 23 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 58d7091eeb1f..7ec65b18085d 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -24,7 +24,7 @@ extern unsigned int cached_irq_mask; #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; extern void init_8259A(int auto_eoi); extern void enable_8259A_irq(unsigned int irq); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c86591b906fa..f5e40339622b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1830,7 +1830,7 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "\nprinting PIC contents\n"); - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); v = inb(0xa1) << 8 | inb(0x21); printk(KERN_DEBUG "... PIC IMR: %04x\n", v); @@ -1844,7 +1844,7 @@ __apicdebuginit(void) print_PIC(void) outb(0x0a,0xa0); outb(0x0a,0x20); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); printk(KERN_DEBUG "... 
PIC ISR: %04x\n", v); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102bef80..8c93a84bb627 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,7 +32,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); struct irq_chip i8259A_chip = { @@ -68,13 +68,13 @@ void disable_8259A_irq(unsigned int irq) unsigned int mask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void enable_8259A_irq(unsigned int irq) @@ -82,13 +82,13 @@ void enable_8259A_irq(unsigned int irq) unsigned int mask = ~(1 << irq); unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } int i8259A_irq_pending(unsigned int irq) @@ -97,12 +97,12 @@ int i8259A_irq_pending(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) ret = inb(PIC_MASTER_CMD) & mask; else ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } @@ -150,7 +150,7 @@ static void mask_and_ack_8259A(unsigned int irq) unsigned int irqmask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. 
We do not want * to overdo spurious IRQ handling - it's usually a sign @@ -183,7 +183,7 @@ handle_real_irq: outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return; spurious_8259A_irq: @@ -285,24 +285,24 @@ void mask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void unmask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void init_8259A(int auto_eoi) @@ -311,7 +311,7 @@ void init_8259A(int auto_eoi) i8259A_auto_eoi = auto_eoi; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ @@ -356,5 +356,5 @@ void init_8259A(int auto_eoi) outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index be2573448ed9..fb5cc5e14cfa 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + raw_spin_lock(&i8259A_lock); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + raw_spin_unlock(&i8259A_lock); } global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 34a279a7471d..ab38ce0984fa 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -559,7 +559,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) struct irq_desc *desc; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* Find out what's interrupting in the PIIX4 master 8259 */ outb(0x0c, 0x20); /* OCW3 Poll command */ @@ -596,7 +596,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) outb(0x60 + realirq, 0x20); } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); desc = irq_to_desc(realirq); @@ -614,7 +614,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) return IRQ_HANDLED; out_unlock: - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return IRQ_NONE; } -- cgit v1.2.2 From c13f3d378f77ce3176628ade452b0e461242faf3 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 15 Feb 2010 11:33:04 +0900 Subject: x86/gart: Unexport gart_iommu_aperture I wrongly exported gart_iommu_aperture in the commit 42590a75019a50012f25a962246498dead428433. 
It's not necessary so let's unexport it. Signed-off-by: FUJITA Tomonori Cc: Joerg Roedel LKML-Reference: <20100215113241P.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index f147a95fd84a..3704997e8b25 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -31,7 +31,6 @@ #include int gart_iommu_aperture; -EXPORT_SYMBOL_GPL(gart_iommu_aperture); int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; -- cgit v1.2.2 From 477346ff74f4c2aed50e8a0db96a61069f3e5b80 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 7 Jan 2010 17:04:54 +1000 Subject: x86-64: Allow fbdev primary video code For some reason the 64-bit tree was doing this differently and I can't see why it would need to. This corrects the behaviour when you have two GPUs plugged in: 32-bit put the console in one place and 64-bit in another. Signed-off-by: Dave Airlie LKML-Reference: <1262847894-27498-1-git-send-email-airlied@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 2 -- arch/x86/include/asm/fb.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 78b32be55e9e..0a43dc515e4c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -135,9 +135,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ # suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ -ifeq ($(CONFIG_X86_32),y) drivers-$(CONFIG_FB) += arch/x86/video/ -endif #### # boot loader support. Several targets are kept for legacy purposes diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index 53018464aea6..2519d0679d99 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; } -#ifdef CONFIG_X86_32 extern int fb_is_primary_device(struct fb_info *info); -#else -static inline int fb_is_primary_device(struct fb_info *info) { return 0; } -#endif #endif /* _ASM_X86_FB_H */ -- cgit v1.2.2 From 580e0ad21d6d6f932461d24b47041e3dd499c23f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Feb 2010 18:40:35 -0800 Subject: core: Move early_res from arch/x86 to kernel/ This makes the range reservation feature available to other architectures. -v2: add get_max_mapped, max_pfn_mapped only defined in x86... to fix PPC compiling -v3: according to hpa, add CONFIG_HAVE_EARLY_RES -v4: fix typo about EARLY_RES in config Signed-off-by: Yinghai Lu LKML-Reference: <4B7B5723.4070009@kernel.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 3 + arch/x86/include/asm/e820.h | 2 +- arch/x86/include/asm/early_res.h | 21 -- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 10 +- arch/x86/kernel/early_res.c | 521 --------------------------------------- 6 files changed, 14 insertions(+), 545 deletions(-) delete mode 100644 arch/x86/include/asm/early_res.h delete mode 100644 arch/x86/kernel/early_res.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 29f9efb74fc7..0e9f8b10de52 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -183,6 +183,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_EARLY_RES + def_bool y + config HAVE_INTEL_TXT def_bool y depends on EXPERIMENTAL && DMAR && ACPI diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index a8299e134437..0e22296790d3 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -112,7 +112,7 @@ extern unsigned long end_user_pfn; extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); -#include +#include extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h deleted file mode 100644 index 9758f3df9dad..000000000000 --- a/arch/x86/include/asm/early_res.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _ASM_X86_EARLY_RES_H -#define _ASM_X86_EARLY_RES_H -#ifdef __KERNEL__ - -extern void reserve_early(u64 start, u64 end, char *name); -extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); -extern void free_early(u64 start, u64 end); -extern void early_res_to_bootmem(u64 start, u64 end); - -void reserve_early_without_check(u64 start, u64 end, char *name); -u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align); -u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start, - u64 *sizep, u64 align); -u64 find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align); -#include -int get_free_all_memory_range(struct range **rangep, int nodeid); - -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_EARLY_RES_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f5fb9f0b6277..d87f09bc5a52 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -38,7 +38,7 @@ obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o -obj-y += bootflag.o e820.o early_res.o +obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 36918d8463ab..740b440fbd73 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -752,6 +751,15 @@ u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) { return find_e820_area(start, end, size, align); } + +u64 __init get_max_mapped(void) +{ + u64 end = max_pfn_mapped; + + end <<= PAGE_SHIFT; + + return end; +} /* * Find next free range after *start */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c deleted file mode 100644 index 
1458dc022343..000000000000 --- a/arch/x86/kernel/early_res.c +++ /dev/null @@ -1,521 +0,0 @@ -/* - * early_res, could be used to replace bootmem - */ -#include -#include -#include -#include -#include - -#include - -/* - * Early reserved memory areas. - */ -/* - * need to make sure this one is bigger enough before - * find_fw_memmap_area could be used - */ -#define MAX_EARLY_RES_X 32 - -struct early_res { - u64 start, end; - char name[15]; - char overlap_ok; -}; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; - -static int max_early_res __initdata = MAX_EARLY_RES_X; -static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < max_early_res && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; - early_res_count--; -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. - */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) -{ - int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[15]; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) - continue; - - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. 
*/ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= max_early_res) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name ? name : "", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -u64 __init __weak find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) -{ - panic("should have find_fw_memmap_area defined with arch"); - - return -1ULL; -} - -static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) -{ - u64 start, end, size, mem; - struct early_res *new; - - /* do we have enough slots left ? */ - if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) - return; - - /* double it */ - mem = -1ULL; - size = sizeof(struct early_res) * max_early_res * 2; - if (early_res == early_res_x) - start = 0; - else - start = early_res[0].end; - end = ex_start; - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - if (mem == -1ULL) { - start = ex_end; - end = max_pfn_mapped << PAGE_SHIFT; - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - } - if (mem == -1ULL) - panic("can not find more space for early_res array"); - - new = __va(mem); - /* save the first one for own */ - new[0].start = mem; - new[0].end = mem + size; - new[0].overlap_ok = 0; - /* copy old to new */ - if (early_res == early_res_x) { - memcpy(&new[1], &early_res[0], - sizeof(struct early_res) * max_early_res); - memset(&new[max_early_res+1], 0, - sizeof(struct early_res) * (max_early_res - 1)); - early_res_count++; - } else { - memcpy(&new[1], &early_res[1], - sizeof(struct early_res) * (max_early_res - 1)); - memset(&new[max_early_res], 0, - sizeof(struct early_res) * max_early_res); - } - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = new; - max_early_res *= 2; - printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", - max_early_res, mem, mem + size - 1); -} - -/* - * Most early reservations come here. 
- * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. - */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init reserve_early_without_check(u64 start, u64 end, char *name) -{ - struct early_res *r; - - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - r = &early_res[early_res_count]; - - r->start = start; - r->end = end; - r->overlap_ok = 0; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= max_early_res || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -#ifdef CONFIG_NO_BOOTMEM -static void __init subtract_early_res(struct range *range, int az) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - -#define DEBUG_PRINT_EARLY_RES 1 - -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO "Subtract (%d early reservations)\n", count); -#endif - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, - r->start, r->end, r->name); -#endif - final_start = PFN_DOWN(r->start); - final_end = PFN_UP(r->end); - if (final_start >= final_end) - continue; - subtract_range(range, az, final_start, final_end); - } - -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - int i, count; - u64 start = 0, end; - u64 size; - u64 mem; - struct range *range; - int nr_range; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - count *= 2; - - size = sizeof(struct range) * count; -#ifdef MAX_DMA32_PFN - if (max_pfn_mapped > MAX_DMA32_PFN) - start = MAX_DMA32_PFN << PAGE_SHIFT; -#endif - end = max_pfn_mapped << PAGE_SHIFT; - mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); - if (mem == -1ULL) - panic("can not find more space for range free"); - - range = __va(mem); - /* use early_node_map[] and early_res to get range array at first */ - memset(range, 0, size); - nr_range = 0; - - /* need to go over early_node_map to find out good range for node */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); -#ifdef CONFIG_X86_32 - subtract_range(range, count, max_low_pfn, -1ULL); -#endif - subtract_early_res(range, count); - nr_range = clean_sort_range(range, count); - - /* need to clear it ? 
*/ - if (nodeid == MAX_NUMNODES) { - memset(&early_res[0], 0, - sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - } - - *rangep = range; - return nr_range; -} -#else -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - - printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count - idx, max_early_res, start, end); - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } - /* clear them */ - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - early_res_count = 0; -} -#endif - -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < max_early_res && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) -{ - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < max_early_res && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - * only with the area.between start to end is active range from early_node_map - * so they are good as RAM - */ -u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - goto out; - if (last > end) - goto out; - - return addr; - -out: - return -1ULL; -} - -u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, - u64 *sizep, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - goto out; - - return addr; - -out: - return -1ULL; -} -- cgit v1.2.2 From 0a832320f1bae6a4169bf683e201378f2437cfc1 Mon Sep 17 00:00:00 2001 From: "Justin P. 
Mattock" Date: Tue, 16 Feb 2010 15:17:29 -0800 Subject: x86: Add iMac9,1 to pci_reboot_dmi_table On the iMac9,1 /sbin/reboot results in a black mangled screen. Adding this DMI entry gets the machine to reboot cleanly as it should. Signed-off-by: Justin P. Mattock LKML-Reference: <1266362249-3337-1-git-send-email-justinmattock@gmail.com> Signed-off-by: H. Peter Anvin Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 704bddcdf64d..8e1aac86b50c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -461,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), }, }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, { } }; -- cgit v1.2.2 From e7b8e675d9c71b868b66f62f725a948047514719 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 Jan 2010 04:40:03 -0500 Subject: tracing: Unify arch_syscall_addr() implementations Most implementations of arch_syscall_addr() are the same, so create a default version in common code and move the one piece that differs (the syscall table) to asm/syscall.h. New arch ports don't have to waste time copying & pasting this simple function. The s390/sparc versions need to be different, so document why. Signed-off-by: Mike Frysinger Acked-by: David S. Miller Acked-by: Paul Mundt Acked-by: Heiko Carstens Cc: Steven Rostedt LKML-Reference: <1264498803-17278-1-git-send-email-vapier@gentoo.org> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/syscall.h | 2 ++ arch/x86/kernel/ftrace.c | 10 ---------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 8d33bc5462d1..c4a348f7bd43 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,6 +16,8 @@ #include #include +extern const unsigned long sys_call_table[]; + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 309689245431..0d93a941934c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -484,13 +484,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned long *sys_call_table; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)(&sys_call_table)[nr]; -} -#endif -- cgit v1.2.2 From f850c30c8b426ba1688cb63b1a3e534eed03a138 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 10 Feb 2010 17:25:17 +0100 Subject: tracing/kprobes: Make Kconfig dependencies generic KPROBES_EVENT actually depends on the regs and stack access API (b1cf540f) and not on x86. So introduce a new config option which architectures can select if they have the API implemented and switch x86. 
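For reference, a sketch of the regs/stack access API that the new option advertises (declarations approximated from b1cf540f's arch/x86/include/asm/ptrace.h; treat the exact signatures as illustrative):

	/* fetch a saved register by its byte offset into struct pt_regs */
	unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
	/* look up that offset by register name, e.g. "ax"; -EINVAL if unknown */
	int regs_query_register_offset(const char *name);
	/* read the n-th entry of the kernel stack that regs points into */
	unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n);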
Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Martin Schwidefsky LKML-Reference: <20100210162517.GB6933@osiris.boeblingen.de.ibm.com> Signed-off-by: Frederic Weisbecker --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 55298e891571..07baa12929b4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -45,6 +45,7 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select USER_STACKTRACE_SUPPORT + select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 -- cgit v1.2.2 From 39c662f60c556908faf861ef0430549b1731b891 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 19:15:48 +0200 Subject: x86: Convert tlbstate_lock to raw_spinlock Signed-off-by: Thomas Gleixner --- arch/x86/mm/tlb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 65b58e4b0b8b..426f3a1a64d3 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -41,7 +41,7 @@ union smp_flush_state { struct { struct mm_struct *flush_mm; unsigned long flush_va; - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; char pad[INTERNODE_CACHE_BYTES]; @@ -181,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is * probably not worth checking this for a cache-hot lock. */ - spin_lock(&f->tlbstate_lock); + raw_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; f->flush_va = va; @@ -199,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = NULL; f->flush_va = 0; - spin_unlock(&f->tlbstate_lock); + raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, @@ -223,7 +223,7 @@ static int __cpuinit init_smp_flush(void) int i; for (i = 0; i < ARRAY_SIZE(flush_state); i++) - spin_lock_init(&flush_state[i].tlbstate_lock); + raw_spin_lock_init(&flush_state[i].tlbstate_lock); return 0; } -- cgit v1.2.2 From 81fc03909a80bead8f553287a2b749a1d29dca64 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Feb 2010 11:16:24 +0000 Subject: kmemcheck: Test the full object in kmemcheck_is_obj_initialized() This is a fix for bug #14845 (bugzilla.kernel.org). The update_checksum() function in mm/kmemleak.c calls kmemcheck_is_obj_initialized() before scanning an object. When KMEMCHECK_PARTIAL_OK is enabled, this function returns true. However, the crc32_le() reads smaller intervals (32-bit) for which kmemcheck_is_obj_initialized() may be false leading to a kmemcheck warning. Note that kmemcheck_is_obj_initialized() is currently only used by kmemleak before scanning a memory location. 
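For context, a sketch of the kmemleak caller being protected here (approximates update_checksum() in mm/kmemleak.c of this era; illustrative, not part of this patch):

	static bool update_checksum(struct kmemleak_object *object)
	{
		u32 old_csum = object->checksum;

		/* with this fix, every byte is known-initialized before hashing */
		if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
			return false;

		object->checksum = crc32(0, (void *)object->pointer, object->size);
		return object->checksum != old_csum;
	}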
Signed-off-by: Catalin Marinas Cc: Andrew Morton Cc: Christian Casteyde Cc: Vegard Nossum Signed-off-by: Pekka Enberg --- arch/x86/mm/kmemcheck/kmemcheck.c | 2 +- arch/x86/mm/kmemcheck/shadow.c | 16 ++++++++++++++-- arch/x86/mm/kmemcheck/shadow.h | 2 ++ 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 8cc183344140..b3b531a4f8e5 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) if (!shadow) return true; - status = kmemcheck_shadow_test(shadow, size); + status = kmemcheck_shadow_test_all(shadow, size); return status == KMEMCHECK_SHADOW_INITIALIZED; } diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index 3f66b82076a3..aec124214d97 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) { +#ifdef CONFIG_KMEMCHECK_PARTIAL_OK uint8_t *x; unsigned int i; x = shadow; -#ifdef CONFIG_KMEMCHECK_PARTIAL_OK /* * Make sure _some_ bytes are initialized. Gcc frequently generates * code to access neighboring bytes. @@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) return x[i]; } + + return x[0]; #else + return kmemcheck_shadow_test_all(shadow, size); +#endif +} + +enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + /* All bytes must be initialized. */ for (i = 0; i < size; ++i) { if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) return x[i]; } -#endif return x[0]; } diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h index af46d9ab9d86..ff0b2f70fbcb 100644 --- a/arch/x86/mm/kmemcheck/shadow.h +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -11,6 +11,8 @@ enum kmemcheck_shadow { void *kmemcheck_shadow_lookup(unsigned long address); enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); +enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, + unsigned int size); void kmemcheck_shadow_set(void *shadow, unsigned int size); #endif -- cgit v1.2.2 From 6738762d73a237ec322b04d8b9d55c8fd5d84713 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:36 -0800 Subject: x86, irq: Remove arch_probe_nr_irqs So keep nr_irqs == NR_IRQS. With radix trees it matters less. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-33-git-send-email-yinghai@kernel.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f5e40339622b..c64ddd9d9979 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3826,28 +3826,6 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } -#ifdef CONFIG_SPARSE_IRQ -int __init arch_probe_nr_irqs(void) -{ - int nr; - - if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) - nr_irqs = NR_VECTORS * nr_cpu_ids; - - nr = nr_irqs_gsi + 8 * nr_cpu_ids; -#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) - /* - * for MSI and HT dyn irq - */ - nr += nr_irqs_gsi * 16; -#endif - if (nr < nr_irqs) - nr_irqs = nr; - - return 0; -} -#endif - static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { -- cgit v1.2.2 From 2b633e3fac5efada088b57d31e65401f22bcc18f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:37 -0800 Subject: smp: Use nr_cpus= to set nr_cpu_ids early On x86, before prefill_possible_map(), nr_cpu_ids will be NR_CPUS aka CONFIG_NR_CPUS. Add nr_cpus= to set nr_cpu_ids, so we can simulate cpus <= 8 being installed on a normal config. -v2: according to Christoph, acpi_numa_init should use nr_cpu_ids instead of NR_CPUS. -v3: add doc in kernel-parameters.txt according to Andrew. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-34-git-send-email-yinghai@kernel.org> Acked-by: Linus Torvalds Signed-off-by: H. Peter Anvin Cc: Tony Luck --- arch/x86/kernel/smpboot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..eff2fe175422 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1213,11 +1213,12 @@ __init void prefill_possible_map(void) total_cpus = max_t(int, possible, num_processors + disabled_cpus); - if (possible > CONFIG_NR_CPUS) { + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { printk(KERN_WARNING "%d Processors exceeds NR_CPUS limit of %d\n", - possible, CONFIG_NR_CPUS); - possible = CONFIG_NR_CPUS; + possible, nr_cpu_ids); + possible = nr_cpu_ids; } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", -- cgit v1.2.2 From f1f6baf8f1df29be38003089787e378567ce0086 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 17 Feb 2010 18:32:06 -0800 Subject: x86, setup: When restoring the screen, update boot_params.screen_info When we restore the screen content after a mode change, we return the cursor to its former position. However, we need to also update boot_params.screen_info accordingly, so that the decompression code knows where on the screen the cursor is. Just in case the video BIOS does something extra screwy, read the cursor position back from the BIOS instead of relying on it doing the right thing. While we're at it, make sure we cap the cursor position to the new screen coordinates. Reported-by: Wim Osterholt Bugzilla-Reference: http://bugzilla.kernel.org/show_bug.cgi?id=15329 Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/video.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index f767164cd5df..43eda284d27f 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -298,11 +298,18 @@ static void restore_screen(void) } /* Restore cursor position */ + if (saved.curx >= xs) + saved.curx = xs-1; + if (saved.cury >= ys) + saved.cury = ys-1; + initregs(&ireg); ireg.ah = 0x02; /* Set cursor position */ ireg.dh = saved.cury; ireg.dl = saved.curx; intcall(0x10, &ireg, NULL); + + store_cursor_position(); } void set_video(void) -- cgit v1.2.2 From eb5b3794062824ba12d883901eea49ea89d0a678 Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Sun, 7 Feb 2010 13:02:50 -0800 Subject: x86, irq: Keep chip_data in create_irq_nr and destroy_irq Version 4: use get_irq_chip_data() in destroy_irq() to get rid of some local vars. When two drivers are setting up MSI-X at the same time via pci_enable_msix() there is a race. See this dmesg excerpt: [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X [ 85.170611] alloc irq_desc for 99 on node -1 [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X [ 85.170614] alloc kstat_irqs on node -1 [ 85.170616] alloc irq_2_iommu on node -1 [ 85.170617] alloc irq_desc for 100 on node -1 [ 85.170619] alloc kstat_irqs on node -1 [ 85.170621] alloc irq_2_iommu on node -1 [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X [ 85.170626] alloc irq_desc for 101 on node -1 [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X [ 85.170630] alloc kstat_irqs on node -1 [ 85.170631] alloc irq_2_iommu on node -1 [ 85.170635] alloc irq_desc for 102 on node -1 [ 85.170636] alloc kstat_irqs on node -1 [ 85.170639] alloc irq_2_iommu on node -1 [ 85.170646] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 As you can see igb and ixgbe are both alternating on create_irq_nr() via pci_enable_msix() in their probe function. ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data = NULL via dynamic_irq_init(). igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[] via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this: cfg_new = irq_desc_ptrs[102]->chip_data; if (cfg_new->vector != 0) continue; This hits the NULL deref. Another possible race exists via pci_disable_msix() in a driver or in the number of error paths that call free_msi_irqs(): destroy_irq() dynamic_irq_cleanup() which sets desc->chip_data = NULL ...race window... desc->chip_data = cfg; Remove the save and restore code for cfg in create_irq_nr() and destroy_irq() and take the desc->lock when checking the irq_cfg. Reported-and-analyzed-by: Brandon Philips Signed-off-by: Yinghai Lu LKML-Reference: <20100207210250.GB8256@jenkins.home.ifup.org> Signed-off-by: Brandon Phiilps Cc: stable@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5e4cce254e43..e93a76bc8670 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3278,12 +3278,9 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) { - dynamic_irq_init(irq); - /* restore it, in case dynamic_irq_init clear it */ - if (desc_new) - desc_new->chip_data = cfg_new; - } + if (irq > 0) + dynamic_irq_init_keep_chip_data(irq); + return irq; } @@ -3305,19 +3302,12 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; - struct irq_cfg *cfg; - struct irq_desc *desc; - /* store it, in case dynamic_irq_cleanup clear it */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - dynamic_irq_cleanup(irq); - /* connect back irq_cfg */ - desc->chip_data = cfg; + dynamic_irq_cleanup_keep_chip_data(irq); free_irte(irq); spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, cfg); + __clear_irq_vector(irq, get_irq_chip_data(irq)); spin_unlock_irqrestore(&vector_lock, flags); } -- cgit v1.2.2 From f619b3d8427eb57f0134dab75b0d217325c72411 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Feb 2010 12:09:07 +0100 Subject: x86, cacheinfo: Remove NUMA dependency, fix for AMD Fam10h rev D1 The show/store_cache_disable routines depend unnecessarily on NUMA's cpu_to_node and the disabling of cache indices broke when !CONFIG_NUMA. Remove that dependency by using a helper which is always correct. While at it, enable L3 Cache Index disable on rev D1 Istanbuls which sport the feature too. Signed-off-by: Borislav Petkov LKML-Reference: <20100218184339.GG20473@aftab> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 589b705e80ed..be5f5c28ddfb 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,7 +327,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) /* see errata #382 and #388 */ if ((boot_cpu_data.x86 == 0x10) && - ((boot_cpu_data.x86_model < 0x9) || + ((boot_cpu_data.x86_model < 0x8) || (boot_cpu_data.x86_mask < 0x1))) return; @@ -744,7 +744,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, unsigned int index) { int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); + int node = amd_get_nb_id(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned int reg = 0; @@ -771,7 +771,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, size_t count, unsigned int index) { int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); + int node = amd_get_nb_id(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; -- cgit v1.2.2 From cb19060abfdecac0d1eb2d2f0e7d6b7a3f8bc4f4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Feb 2010 19:37:14 +0100 Subject: x86, cacheinfo: Enable L3 CID only on AMD Final stage linking can fail with arch/x86/built-in.o: In function `store_cache_disable': intel_cacheinfo.c:(.text+0xc509): undefined reference to `amd_get_nb_id' arch/x86/built-in.o: In function `show_cache_disable': intel_cacheinfo.c:(.text+0xc7d3): undefined reference to `amd_get_nb_id' when CONFIG_CPU_SUP_AMD is not enabled because the amd_get_nb_id helper is defined in AMD-specific code but also used in generic code (intel_cacheinfo.c). Reorganize the L3 cache index disable code under CONFIG_CPU_SUP_AMD since it is AMD-only anyway. Signed-off-by: Borislav Petkov LKML-Reference: <20100218184210.GF20473@aftab> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 186 ++++++++++++++++++---------------- 1 file changed, 98 insertions(+), 88 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index be5f5c28ddfb..d440123c556f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -293,6 +293,13 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); +}; + +#ifdef CONFIG_CPU_SUP_AMD static unsigned int __cpuinit amd_calc_l3_indices(void) { /* @@ -303,7 +310,7 @@ static unsigned int __cpuinit amd_calc_l3_indices(void) int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned int sc0, sc1, sc2, sc3; - u32 val; + u32 val = 0; pci_read_config_dword(dev, 0x1C4, &val); @@ -335,6 +342,94 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) this_leaf->l3_indices = amd_calc_l3_indices(); } +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!dev) + return -EINVAL; + + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "0x%08x\n", reg); +} + +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) + return -EINVAL; + + if (strict_strtoul(buf, 10, &val) < 0) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) + return -EINVAL; + + val |= BIT(30); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. 
+ */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); + return count; +} + +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); + +#else /* CONFIG_CPU_SUP_AMD */ +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +{ +}; +#endif /* CONFIG_CPU_SUP_AMD */ + static int __cpuinit cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) @@ -740,88 +835,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, - unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = amd_get_nb_id(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned int reg = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!dev) - return -EINVAL; - - pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "0x%08x\n", reg); -} - -#define SHOW_CACHE_DISABLE(index) \ -static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ -{ \ - return show_cache_disable(this_leaf, buf, index); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = amd_get_nb_id(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned long val = 0; - -#define SUBCACHE_MASK (3UL << 20) -#define SUBCACHE_INDEX 0xfff - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; - - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) - return -EINVAL; - - val |= BIT(30); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - /* - * We need to WBINVD on a core on the node containing the L3 cache which - * indices we disable therefore a simple wbinvd() is not sufficient. 
- */ - wbinvd_on_cpu(cpu); - pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); - return count; -} - -#define STORE_CACHE_DISABLE(index) \ -static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ -{ \ - return store_cache_disable(this_leaf, buf, count, index); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); -}; - #define define_one_ro(_name) \ static struct _cache_attr _name = \ __ATTR(_name, 0444, show_##_name, NULL) @@ -836,11 +849,6 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); - #define DEFAULT_SYSFS_CACHE_ATTRS \ &type.attr, \ &level.attr, \ @@ -859,8 +867,10 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, +#ifdef CONFIG_CPU_SUP_AMD &cache_disable_0.attr, &cache_disable_1.attr, +#endif NULL }; -- cgit v1.2.2 From eb572a5c7951288e265b3e8f9a5d37b6abb2e996 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 18 Feb 2010 22:15:04 -0800 Subject: x86-64, setup: Inhibit decompressor output if video info is invalid Inhibit output from the kernel decompressor if the video information is invalid. This was already the case for 32 bits, make 64 bits match. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/boot/compressed/misc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 3b22fe8ab91b..3487e86ed3cc 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -185,11 +185,9 @@ static void __putstr(int error, const char *s) return; #endif -#ifdef CONFIG_X86_32 if (real_mode->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; -#endif x = real_mode->screen_info.orig_x; y = real_mode->screen_info.orig_y; -- cgit v1.2.2
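The 64-bit change above simply reuses the guard the 32-bit decompressor already had. A minimal user-space model of that check follows; the struct definition and the zeroed sample values are illustrative stand-ins for the real boot_params screen_info, not the kernel's types:

#include <stdio.h>

/* Assumed field names loosely mirror struct screen_info. */
struct screen_info {
	int orig_video_mode, orig_video_lines, orig_video_cols;
};

static int video_info_valid(const struct screen_info *si)
{
	/* all-zero info means the boot loader never set up a screen */
	return !(si->orig_video_mode == 0 &&
		 si->orig_video_lines == 0 &&
		 si->orig_video_cols == 0);
}

int main(void)
{
	struct screen_info si = { 0, 0, 0 };	/* e.g. booted without BIOS video */

	if (!video_info_valid(&si))
		return 0;			/* decompressor stays silent */
	puts("decompressor may print");
	return 0;
}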
From 84d710926797a6e317e7e94654a3ccd771cfd8a3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 18 Feb 2010 16:00:59 +0100 Subject: hw-breakpoints: Accept breakpoints on NULL address Before we had a generic breakpoint API, ptrace was accepting breakpoints on NULL address in x86. The new API refuses them without giving strong reasons. We need to follow the previous behaviour as some userspace apps like Wine need such NULL breakpoints to ensure old emulated software protections are still working. This fixes a 2.6.32 - 2.6.33-x ptrace regression. Reported-and-tested-by: Michael Stefaniuc Signed-off-by: Frederic Weisbecker Acked-by: K.Prasad Acked-by: Roland McGrath Cc: Alan Stern Cc: Maneesh Soni Cc: Alexandre Julliard Cc: Rafael J. Wysocki Cc: Maciej Rutecki --- arch/x86/kernel/hw_breakpoint.c | 30 +++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 05d5fec64a94..bb6006e3e295 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } -/* - * Store a breakpoint's encoded address, length, and type. - */ -static int arch_store_info(struct perf_event *bp) -{ - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - /* - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); - if (info->address) - return 0; - - return -EINVAL; -} - int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { @@ -362,10 +343,13 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - ret = arch_store_info(bp); - - if (ret < 0) - return ret; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. -- cgit v1.2.2
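The dr7 fix that follows keeps the user's exact local/global enable bits. As background, each of the four debug-address slots owns a pair of enable bits in dr7; a rough model of that layout, with DR_* values mirroring the constants in asm/debugreg.h and the sample slots chosen purely for illustration:

#include <stdio.h>

#define DR_ENABLE_SIZE		2	/* 2 enable bits per slot */
#define DR_LOCAL_ENABLE_SHIFT	0
#define DR_GLOBAL_ENABLE_SHIFT	1

/* Bit a debugger would set in dr7 to enable breakpoint slot 'slot'. */
static unsigned long dr7_enable_bit(int slot, int global)
{
	int shift = global ? DR_GLOBAL_ENABLE_SHIFT : DR_LOCAL_ENABLE_SHIFT;

	return 1UL << (slot * DR_ENABLE_SIZE + shift);
}

int main(void)
{
	/* slot 0 enabled locally, slot 1 globally; Linux treats both alike */
	unsigned long dr7 = dr7_enable_bit(0, 0) | dr7_enable_bit(1, 1);

	printf("dr7 = %#lx\n", dr7);	/* prints 0x9 */
	return 0;
}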
From 326264a02448b0ac51f78f178b78e830aa077a0b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 18 Feb 2010 18:24:18 +0100 Subject: hw-breakpoint: Keep track of dr7 local enable bits When the user enables breakpoints through dr7, he can choose between "local" or "global" enable bits but given how Linux is implemented, both have the same effect. That said we don't keep track of how the user enabled the breakpoints so when the user requests the dr7 value, we only translate the "enabled" status using the global enabled bits. It means that if the user enabled a breakpoint using the local enabled bit, reading back dr7 will set the global bit and clear the local one. Apps like Wine expect a full dr7 POKEUSER/PEEKUSER match for emulated software that implements old reverse engineering protection schemes. We fix that by keeping track of the whole dr7 value given by the user in the thread structure to drop this bug. We'll think about something more proper later. This fixes a 2.6.32 - 2.6.33-x ptrace regression. Reported-and-tested-by: Michael Stefaniuc Signed-off-by: Frederic Weisbecker Acked-by: K.Prasad Cc: Alan Stern Cc: Maneesh Soni Cc: Alexandre Julliard Cc: Rafael J. Wysocki Cc: Maciej Rutecki --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/ptrace.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fc801bab1b3b..b753ea59703a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -450,6 +450,8 @@ struct thread_struct { struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ unsigned long debugreg6; + /* Keep track of the exact dr7 value set by the user */ + unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..0c1033d61e59 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -702,7 +702,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { - val = ptrace_get_dr7(thread->ptrace_bps); + val = thread->ptrace_dr7; } return val; } @@ -778,8 +778,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) return rc; } /* All that's left is DR7 */ - if (n == 7) + if (n == 7) { rc = ptrace_write_dr7(tsk, val); + if (!rc) + thread->ptrace_dr7 = val; + } ret_path: return rc; -- cgit v1.2.2 From 8e92dc767abb58357e696a48fc3d8ce615a9c01a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 19 Feb 2010 13:21:38 -0800 Subject: x86, setup: Don't skip mode setting for the standard VGA modes The code for setting standard VGA modes probes for the current mode, and skips the mode setting if the mode is 3 (color text 80x25) or 7 (mono text 80x25). Unfortunately, there are BIOSes, including the VMware BIOS, which report the previous mode if function 0F is queried while the screen is in a VESA mode, and of course, nothing can help a mode poked directly into the hardware. As such, the safe option is to set the mode anyway, and only query to see if we should be using mode 7 rather than mode 3. People who don't want any mode setting at all should probably use vga=0x0f04 (VIDEO_CURRENT_MODE). It's possible that should be the kernel default. Reported-by: Rene Arends Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/boot/video-vga.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 819caa1f2008..ed7aeff786b2 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -42,22 +42,15 @@ static u8 vga_set_basic_mode(void) { struct biosregs ireg, oreg; u16 ax; - u8 rows; u8 mode; initregs(&ireg); + /* Query current mode */ ax = 0x0f00; intcall(0x10, &ireg, &oreg); mode = oreg.al; - set_fs(0); - rows = rdfs8(0x484); /* rows minus one */ - - if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && - (rows == 0 || rows == 24)) - return mode; - if (mode != 3 && mode != 7) mode = 3; -- cgit v1.2.2 From b72d0db9dd41da1f2ec6274b03e8909583c64e41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 16:24:51 +0200 Subject: x86: Move pci init function to x86_init The PCI initialization in pci_subsys_init() is a mess. pci_numaq_init, pci_acpi_init, pci_visws_init and pci_legacy_init are called and each implementation checks and eventually modifies the global variable pcibios_scanned. x86_init functions allow us to do this more elegantly. The pci.init function pointer is preset to pci_legacy_init. numaq, acpi and visws can modify the pointer in their early setup functions. The functions return 0 when they did the full initialization including bus scan. A non-zero return value indicates that pci_legacy_init needs to be called either because the selected function failed or wants the generic bus scan in pci_legacy_init to happen (e.g. visws).
Signed-off-by: Thomas Gleixner LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80CFE@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/numaq.h | 1 + arch/x86/include/asm/pci.h | 9 ++++++++- arch/x86/include/asm/pci_x86.h | 14 +++++++++++--- arch/x86/include/asm/setup.h | 2 -- arch/x86/include/asm/visws/cobalt.h | 2 ++ arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/acpi/boot.c | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 1 + arch/x86/kernel/visws_quirks.c | 6 +----- arch/x86/kernel/x86_init.c | 5 +++++ arch/x86/pci/acpi.c | 6 +----- arch/x86/pci/common.c | 6 ------ arch/x86/pci/legacy.c | 22 ++++++++-------------- arch/x86/pci/numaq_32.c | 6 ------ arch/x86/pci/visws.c | 6 ++---- 15 files changed, 53 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 9f0a5f5d29ec..ef6bf81f8460 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -30,6 +30,7 @@ extern int found_numaq; extern int get_memcfg_numaq(void); +extern int pci_numaq_init(void); extern void *xquad_portio; diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ada8c201d513..8bd433ccc242 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus) #ifdef CONFIG_PCI extern unsigned int pcibios_assign_all_busses(void); +extern int pci_legacy_init(void); +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif #else -#define pcibios_assign_all_busses() 0 +# define pcibios_assign_all_busses() 0 +# define x86_default_pci_init NULL #endif extern unsigned long pci_mem_start; diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b4bf9a942ed0..440124f1224d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -82,7 +82,6 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; -extern int pcibios_scanned; extern spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); @@ -112,9 +111,8 @@ extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); extern int __init pcibios_irq_init(void); -extern int __init pci_visws_init(void); -extern int __init pci_numaq_init(void); extern int __init pcibios_init(void); +extern int pci_legacy_init(void); /* pci-mmconfig.c */ @@ -182,3 +180,13 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) { asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); } + +#ifdef CONFIG_PCI +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif +#else +# define x86_default_pci_init NULL +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 18e496c98ff0..86b1506f4179 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -37,10 +37,8 @@ void setup_bios_corruption_check(void); #ifdef CONFIG_X86_VISWS extern void visws_early_detect(void); -extern int is_visws_box(void); #else static inline void visws_early_detect(void) { } -static inline int is_visws_box(void) { return 0; } #endif extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h index 
166adf61e770..2edb37637ead 100644 --- a/arch/x86/include/asm/visws/cobalt.h +++ b/arch/x86/include/asm/visws/cobalt.h @@ -122,4 +122,6 @@ extern char visws_board_type; extern char visws_board_rev; +extern int pci_visws_init(void); + #endif /* _ASM_X86_VISWS_COBALT_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ea0e8ea15e15..f145d843f03d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -98,6 +98,14 @@ struct x86_init_iommu { int (*iommu_init)(void); }; + /* + * struct x86_init_pci - platform specific pci init functions + * @init: platform specific pci init + */ +struct x86_init_pci { + int (*init)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -110,6 +118,7 @@ struct x86_init_ops { struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; + struct x86_init_pci pci; }; /** diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdfa5ca4..054a5f5548b0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -1604,6 +1605,9 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (!acpi_noirq) + x86_init.pci.init = pci_acpi_init; + return 0; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 98c4665f251c..be5c4fd47778 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -277,6 +277,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; x86_init.timers.tsc_pre_init = numaq_tsc_init; + x86_init.pci.init = pci_numaq_init; } } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 34a279a7471d..843e9d30a1e3 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -49,11 +49,6 @@ extern int no_broadcast; char visws_board_type = -1; char visws_board_rev = -1; -int is_visws_box(void) -{ - return visws_board_type >= 0; -} - static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -242,6 +237,7 @@ void __init visws_early_detect(void) x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; x86_init.timers.timer_init = visws_time_init; + x86_init.pci.init = pci_visws_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ccd179dec36e..81faa6d67d69 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -70,6 +71,10 @@ struct x86_init_ops x86_init __initdata = { .iommu = { .iommu_init = iommu_init_noop, }, + + .pci = { + .init = x86_default_pci_init, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 959e548a7039..73b3fe9aa716 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -282,15 +282,11 @@ int __init pci_acpi_init(void) { struct pci_dev *dev = NULL; - if (pcibios_scanned) - return 0; - if (acpi_noirq) - return 0; + return -ENODEV; printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); acpi_irq_penalty_init(); - pcibios_scanned++; pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c 
index d2552c68e94d..f5770b5846a6 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -71,12 +71,6 @@ struct pci_ops pci_root_ops = { .write = pci_write, }; -/* - * legacy, numa, and acpi all want to call pcibios_scan_root - * from their initcalls. This flag prevents that. - */ -int pcibios_scanned; - /* * This interrupt-safe spinlock protects all accesses to PCI * configuration space. diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 4061bb0f267d..0daf264ddb6c 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void) } } -static int __init pci_legacy_init(void) +int __init pci_legacy_init(void) { if (!raw_pci_ops) { printk("PCI: System does not support PCI\n"); return 0; } - if (pcibios_scanned++) - return 0; - printk("PCI: Probing PCI hardware\n"); pci_root_bus = pcibios_scan_root(0); if (pci_root_bus) @@ -55,16 +52,13 @@ static int __init pci_legacy_init(void) int __init pci_subsys_init(void) { -#ifdef CONFIG_X86_NUMAQ - pci_numaq_init(); -#endif -#ifdef CONFIG_ACPI - pci_acpi_init(); -#endif -#ifdef CONFIG_X86_VISWS - pci_visws_init(); -#endif - pci_legacy_init(); + /* + * The init function returns an non zero value when + * pci_legacy_init should be invoked. + */ + if (x86_init.pci.init()) + pci_legacy_init(); + pcibios_fixup_peer_bridges(); pcibios_irq_init(); pcibios_init(); diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8eb295e116f6..45c0c9e45903 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -152,14 +152,8 @@ int __init pci_numaq_init(void) { int quad; - if (!found_numaq) - return 0; - raw_pci_ops = &pci_direct_conf1_mq; - if (pcibios_scanned++) - return 0; - pci_root_bus = pcibios_scan_root(0); if (pci_root_bus) pci_bus_add_devices(pci_root_bus); diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index bcead7a46871..03008f72eb04 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) int __init pci_visws_init(void) { - if (!is_visws_box()) - return -1; - pcibios_enable_irq = &pci_visws_enable_irq; pcibios_disable_irq = &pci_visws_disable_irq; @@ -90,5 +87,6 @@ int __init pci_visws_init(void) pci_scan_bus_with_sysdata(pci_bus1); pci_fixup_irqs(pci_common_swizzle, visws_map_irq); pcibios_resource_survey(); - return 0; + /* Request bus scan */ + return 1; } -- cgit v1.2.2
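Before the next patch extends it, the shape of the x86_init.pci hook introduced above is worth seeing in miniature: a function-pointer table preset to a default that platform setup code may overwrite before pci_subsys_init() runs, with a non-zero return requesting the legacy fallback. A self-contained sketch, with the platform detection reduced to a stub and the helper names invented for illustration:

#include <stdio.h>

/* Toy model of the x86_init.pci.init override pattern. */
struct x86_init_pci { int (*init)(void); };
struct x86_init_ops { struct x86_init_pci pci; };

static int pci_legacy_init_sketch(void)
{
	puts("PCI: Probing PCI hardware");
	return 0;
}

static int pci_acpi_init_sketch(void)
{
	puts("PCI: Using ACPI for IRQ routing");
	return 0;	/* full init done, no legacy fallback needed */
}

static struct x86_init_ops x86_init = {
	.pci = { .init = pci_legacy_init_sketch },
};

int main(void)
{
	if (1 /* e.g. ACPI tables found during early setup */)
		x86_init.pci.init = pci_acpi_init_sketch;

	/* pci_subsys_init(): fall back when init() returns non-zero */
	if (x86_init.pci.init())
		pci_legacy_init_sketch();
	return 0;
}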
From ab3b37937e8f4fb38dc9780b7bc3fd3c5195cca3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 17:47:33 +0200 Subject: x86: Add pci_init_irq to x86_init Moorestown wants to reuse pcibios_init_irq but needs to provide its own implementation of pci_enable_irq. After we disentangled the init we can move the init_irq call to x86_init and remove the pci_enable_irq != NULL check in pcibios_init_irq. pci_enable_irq is compile time initialized to pirq_enable_irq and the special cases which override it (visws and acpi) set the x86_init function pointer to noop. That allows MRST to override pci_enable_irq and otherwise run pcibios_init_irq unmodified. Signed-off-by: Thomas Gleixner LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80CFF@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pci_x86.h | 4 +++- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/visws_quirks.c | 1 + arch/x86/kernel/x86_init.c | 1 + arch/x86/pci/acpi.c | 1 + arch/x86/pci/irq.c | 12 ++++-------- arch/x86/pci/legacy.c | 2 +- 7 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 440124f1224d..46511c5be456 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -110,7 +110,7 @@ extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); -extern int __init pcibios_irq_init(void); +extern void __init pcibios_irq_init(void); extern int __init pcibios_init(void); extern int pci_legacy_init(void); @@ -187,6 +187,8 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) # else # define x86_default_pci_init pci_legacy_init # endif +# define x86_default_pci_init_irq pcibios_irq_init #else # define x86_default_pci_init NULL +# define x86_default_pci_init_irq NULL #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f145d843f03d..34f61cd56f3b 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -101,9 +101,11 @@ struct x86_init_iommu { /* * struct x86_init_pci - platform specific pci init functions * @init: platform specific pci init * @init_irq: platform specific pci irq init */ struct x86_init_pci { int (*init)(void); + void (*init_irq)(void); }; /** diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 843e9d30a1e3..b48ef6c0d716 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -238,6 +238,7 @@ void __init visws_early_detect(void) x86_init.irqs.trap_init = visws_trap_init; x86_init.timers.timer_init = visws_time_init; x86_init.pci.init = pci_visws_init; + x86_init.pci.init_irq = x86_init_noop; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 81faa6d67d69..203f26fb7f33 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -74,6 +74,7 @@ struct x86_init_ops x86_init __initdata = { .pci = { .init = x86_default_pci_init, + .init_irq = x86_default_pci_init_irq, }, }; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 73b3fe9aa716..b53f0487e2d3 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -289,6 +289,7 @@ int __init pci_acpi_init(void) acpi_irq_penalty_init(); pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; + x86_init.pci.init_irq = x86_init_noop; if (pci_routeirq) { /* diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0696d506c4ad..0f40ff20dd67 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -53,7 +53,7 @@ struct irq_router_handler { int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); }; -int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; +int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq; void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; /* @@ -1110,12 +1110,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = { { } }; -int __init pcibios_irq_init(void) +void __init pcibios_irq_init(void) { DBG(KERN_DEBUG "PCI: IRQ init\n"); - if (pcibios_enable_irq || raw_pci_ops == NULL) - return 0; + if (raw_pci_ops == NULL) - return; dmi_check_system(pciirq_dmi_table); @@ -1142,8 @@ int __init
pcibios_irq_init(void) pirq_table = NULL; } - pcibios_enable_irq = pirq_enable_irq; - pcibios_fixup_irqs(); if (io_apic_assign_pci_irqs && pci_routeirq) { @@ -1157,8 +1155,6 @@ int __init pcibios_irq_init(void) for_each_pci_dev(dev) pirq_enable_irq(dev); } - - return 0; } static void pirq_penalize_isa_irq(int irq, int active) diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 0daf264ddb6c..0db5eaf54560 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -60,7 +60,7 @@ int __init pci_subsys_init(void) pci_legacy_init(); pcibios_fixup_peer_bridges(); - pcibios_irq_init(); + x86_init.pci.init_irq(); pcibios_init(); return 0; -- cgit v1.2.2 From 9325a28ce2fa7c597e5ed41455a06c30b82b5710 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 17:51:26 +0200 Subject: x86: Add pcibios_fixup_irqs to x86_init Platforms like Moorestown want to override the pcibios_fixup_irqs default function. Add it to x86_init.pci. Signed-off-by: Thomas Gleixner LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D00@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pci_x86.h | 3 +++ arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/x86_init.c | 2 ++ arch/x86/pci/irq.c | 4 ++-- 4 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 46511c5be456..6e69edfbf074 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -113,6 +113,7 @@ extern int __init pci_acpi_init(void); extern void __init pcibios_irq_init(void); extern int __init pcibios_init(void); extern int pci_legacy_init(void); +extern void pcibios_fixup_irqs(void); /* pci-mmconfig.c */ @@ -188,7 +189,9 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) # define x86_default_pci_init pci_legacy_init # endif # define x86_default_pci_init_irq pcibios_irq_init +# define x86_default_pci_fixup_irqs pcibios_fixup_irqs #else # define x86_default_pci_init NULL # define x86_default_pci_init_irq NULL +# define x86_default_pci_fixup_irqs NULL #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 34f61cd56f3b..8ef56f21f9f0 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -102,10 +102,12 @@ struct x86_init_iommu { * struct x86_init_pci - platform specific pci init functions * @init: platform specific pci init * @init_irq: platform specific pci irq init + * @fixup_irqs: platform specific pci irq fixup */ struct x86_init_pci { int (*init)(void); void (*init_irq)(void); + void (*fixup_irqs)(void); }; /** diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 203f26fb7f33..1817cd7a03fa 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -4,6 +4,7 @@ * For licencing details see kernel-base/COPYING */ #include +#include #include #include @@ -75,6 +76,7 @@ struct x86_init_ops x86_init __initdata = { .pci = { .init = x86_default_pci_init, .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, }, }; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0f40ff20dd67..a60deb6e6696 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1016,7 +1016,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 1; } -static void __init pcibios_fixup_irqs(void) +void __init pcibios_fixup_irqs(void) { struct pci_dev *dev = NULL; u8 pin; @@ -1142,7 
+1142,7 @@ void __init pcibios_irq_init(void) pirq_table = NULL; } - pcibios_fixup_irqs(); + x86_init.pci.fixup_irqs(); if (io_apic_assign_pci_irqs && pci_routeirq) { struct pci_dev *dev = NULL; -- cgit v1.2.2 From d39f6495f66616b637260405d0b5dc2656bc490e Mon Sep 17 00:00:00 2001 From: Alek Du Date: Mon, 7 Sep 2009 16:25:45 +0800 Subject: x86, ioapic: Improve handling of i8259A irq init Since we already track the number of legacy vectors by nr_legacy_irqs, we can avoid static vector allocations -- we can use dynamic ones. Signed-off-by: Alek Du LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D01@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816d..75265ab83b17 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -140,27 +140,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg irq_cfgx[] = { +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; #else -static struct irq_cfg irq_cfgx[NR_IRQS] = { +static struct irq_cfg irq_cfgx[NR_IRQS]; #endif - [0] = { .vector = IRQ0_VECTOR, }, - [1] = { .vector = IRQ1_VECTOR, }, - [2] = { .vector = IRQ2_VECTOR, }, - [3] = { .vector = IRQ3_VECTOR, }, - [4] = { .vector = IRQ4_VECTOR, }, - [5] = { .vector = IRQ5_VECTOR, }, - [6] = { .vector = IRQ6_VECTOR, }, - [7] = { .vector = IRQ7_VECTOR, }, - [8] = { .vector = IRQ8_VECTOR, }, - [9] = { .vector = IRQ9_VECTOR, }, - [10] = { .vector = IRQ10_VECTOR, }, - [11] = { .vector = IRQ11_VECTOR, }, - [12] = { .vector = IRQ12_VECTOR, }, - [13] = { .vector = IRQ13_VECTOR, }, - [14] = { .vector = IRQ14_VECTOR, }, - [15] = { .vector = IRQ15_VECTOR, }, -}; void __init io_apic_disable_legacy(void) { @@ -181,6 +164,8 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { + if (i < nr_legacy_irqs) + cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); -- cgit v1.2.2
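The table above loses its sixteen static initializers because arch_early_irq_init() can now compute the legacy vectors. A stand-alone sketch of that assignment; IRQ0_VECTOR and the array sizes here are illustrative values, not the kernel's:

#include <stdio.h>

#define IRQ0_VECTOR	0x30	/* illustrative base vector */
#define NR_IRQS		24
#define NR_LEGACY	16

struct irq_cfg { unsigned int vector; };

int main(void)
{
	struct irq_cfg cfg[NR_IRQS] = { { 0 } };
	int i;

	/* only the legacy slots get a precomputed vector */
	for (i = 0; i < NR_IRQS; i++)
		if (i < NR_LEGACY)
			cfg[i].vector = IRQ0_VECTOR + i;

	printf("irq 3 -> vector %#x\n", cfg[3].vector);
	return 0;
}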
From 35f720c5930f689647d51ad77e2a8d6f0abf66c8 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 17 Sep 2009 07:36:43 -0700 Subject: x86: Initialize stack canary in secondary start Some secondary clockevent setup code needs to call request_irq, which will cause a fake stack check failure in schedule() if the voluntary preemption model is chosen. It is safe to have the stack canary initialized here early, since start_secondary() does not return. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D02@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b4e870cbdc60..3e6150d421e4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -324,6 +325,9 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); + /* to prevent fake stack check failure in clock setup */ + boot_init_stack_canary(); + x86_cpuinit.setup_percpu_clockev(); wmb(); -- cgit v1.2.2 From ef3548668c02cc8c3922f4423f32b53e662811c6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 9 Nov 2009 11:24:14 -0800 Subject: x86, pic: Introduce legacy_pic abstraction This patch makes the i8259A-like legacy programmable interrupt controller code into a driver so that legacy PIC functions can be selected at runtime based on platform information, such as the HW subarchitecture ID. The default structure of legacy_pic maintains the current code path for x86 PC. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D03@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i8259.h | 13 +++++++++++++ arch/x86/kernel/i8259.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 58d7091eeb1f..e8a3e05c2882 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -57,6 +57,19 @@ static inline void outb_pic(unsigned char value, unsigned int port) extern struct irq_chip i8259A_chip; +struct legacy_pic { + int nr_legacy_irqs; + struct irq_chip *chip; + void (*mask_all)(void); + void (*restore_mask)(void); + void (*init)(int auto_eoi); + int (*irq_pending)(unsigned int irq); + void (*make_irq)(unsigned int irq); +}; + +extern struct legacy_pic *legacy_pic; +extern struct legacy_pic null_legacy_pic; + extern void mask_8259A(void); extern void unmask_8259A(void); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102bef80..b80987ca33ea 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -358,3 +358,46 @@ void init_8259A(int auto_eoi) spin_unlock_irqrestore(&i8259A_lock, flags); } +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID.
+ */ + +static void __init legacy_pic_noop(void) { }; +static void __init legacy_pic_uint_noop(unsigned int unused) { }; +static void __init legacy_pic_int_noop(int unused) { }; + +static struct irq_chip dummy_pic_chip = { + .name = "dummy pic", + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, + .disable = legacy_pic_uint_noop, + .mask_ack = legacy_pic_uint_noop, +}; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ + return 0; +} + +struct legacy_pic null_legacy_pic = { + .nr_legacy_irqs = 0, + .chip = &dummy_pic_chip, + .mask_all = legacy_pic_noop, + .restore_mask = legacy_pic_noop, + .init = legacy_pic_int_noop, + .irq_pending = legacy_pic_irq_pending_noop, + .make_irq = legacy_pic_uint_noop, +}; + +struct legacy_pic default_legacy_pic = { + .nr_legacy_irqs = NR_IRQS_LEGACY, + .chip = &i8259A_chip, + .mask_all = mask_8259A, + .restore_mask = unmask_8259A, + .init = init_8259A, + .irq_pending = i8259A_irq_pending, + .make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; -- cgit v1.2.2 From b81bb373a7e832a43921356aa1291044d7f52fb1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 9 Nov 2009 11:27:04 -0800 Subject: x86, pic: Make use of legacy_pic abstraction This patch replaces legacy PIC-related global variable and functions with the new legacy_pic abstraction. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D04@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/hw_irq.h | 7 ------ arch/x86/include/asm/i8259.h | 8 ------ arch/x86/kernel/apic/apic.c | 8 +++--- arch/x86/kernel/apic/io_apic.c | 57 ++++++++++++++++++++---------------------- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/i8259.c | 21 ++++++++++------ arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/smpboot.c | 5 ++-- arch/x86/kernel/visws_quirks.c | 14 +++++++---- 9 files changed, 59 insertions(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eeac829a0f44..a929c9ede33d 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -53,13 +53,6 @@ extern void threshold_interrupt(void); extern void call_function_interrupt(void); extern void call_function_single_interrupt(void); -/* PIC specific functions */ -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); - /* IOAPIC */ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) extern unsigned long io_apic_irqs; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index e8a3e05c2882..2832babd91fc 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -26,11 +26,6 @@ extern unsigned int cached_irq_mask; extern spinlock_t i8259A_lock; -extern void init_8259A(int auto_eoi); -extern void enable_8259A_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern unsigned int startup_8259A_irq(unsigned int irq); - /* the PIC may need a careful delay on some platforms, hence specific calls */ static inline unsigned char inb_pic(unsigned int port) { @@ -70,7 +65,4 @@ struct legacy_pic { extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; -extern void mask_8259A(void); -extern void unmask_8259A(void); - #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/kernel/apic/apic.c 
b/arch/x86/kernel/apic/apic.c index dfca210f6a10..94f22b12858d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1390,7 +1390,7 @@ void __init enable_IR_x2apic(void) } local_irq_save(flags); - mask_8259A(); + legacy_pic->mask_all(); mask_IO_APIC_setup(ioapic_entries); if (dmar_table_init_ret) @@ -1422,7 +1422,7 @@ void __init enable_IR_x2apic(void) nox2apic: if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); + legacy_pic->restore_mask(); local_irq_restore(flags); out: @@ -2018,7 +2018,7 @@ static int lapic_resume(struct sys_device *dev) } mask_IO_APIC_setup(ioapic_entries); - mask_8259A(); + legacy_pic->mask_all(); } if (x2apic_mode) @@ -2062,7 +2062,7 @@ static int lapic_resume(struct sys_device *dev) if (intr_remapping_enabled) { reenable_intr_remapping(x2apic_mode); - unmask_8259A(); + legacy_pic->restore_mask(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 75265ab83b17..1704cd82db5f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -94,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* Number of legacy interrupts */ -static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; @@ -147,7 +145,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS]; void __init io_apic_disable_legacy(void) { - nr_legacy_irqs = 0; nr_irqs_gsi = 0; } @@ -164,13 +161,13 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { - if (i < nr_legacy_irqs) + if (i < legacy_pic->nr_legacy_irqs) cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < nr_legacy_irqs) + if (i < legacy_pic->nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -850,7 +847,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < nr_legacy_irqs) { + if (irq < legacy_pic->nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1446,8 +1443,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < nr_legacy_irqs) - disable_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->chip->mask(irq); ioapic_write_entry(apic_id, pin, entry); } @@ -1810,7 +1807,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1894,7 +1891,7 @@ void __init enable_IO_APIC(void) nr_ioapic_registers[apic] = reg_01.bits.entries+1; } - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; for(apic = 0; apic < nr_ioapics; apic++) { @@ -1951,7 +1948,7 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; /* @@ -2184,9 +2181,9 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < nr_legacy_irqs) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) + if (irq < legacy_pic->nr_legacy_irqs) { + legacy_pic->chip->mask(irq); + if 
(legacy_pic->irq_pending(irq)) was_pending = 1; } cfg = irq_cfg(irq); @@ -2719,8 +2716,8 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < nr_legacy_irqs) - make_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; @@ -2877,7 +2874,7 @@ static inline void __init check_timer(void) /* * get/set the timer IRQ vector: */ - disable_8259A_irq(0); + legacy_pic->chip->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -2890,7 +2887,7 @@ static inline void __init check_timer(void) * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); + legacy_pic->init(1); #ifdef CONFIG_X86_32 { unsigned int ver; @@ -2949,7 +2946,7 @@ static inline void __init check_timer(void) if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2972,14 +2969,14 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } goto out; } @@ -2987,7 +2984,7 @@ static inline void __init check_timer(void) * Cleanup, just in case ... */ local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -3006,22 +3003,22 @@ static inline void __init check_timer(void) lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as ExtINT IRQ...\n"); - init_8259A(0); - make_8259A_irq(0); + legacy_pic->init(0); + legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); @@ -3063,7 +3060,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + io_apic_irqs = legacy_pic->nr_legacy_irqs ? 
~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3074,7 +3071,7 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - if (nr_legacy_irqs) + if (legacy_pic->nr_legacy_irqs) check_timer(); } @@ -3875,7 +3872,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= nr_legacy_irqs) { + if (irq >= legacy_pic->nr_legacy_irqs) { cfg = desc->chip_data; if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396cb..3817739acee9 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -177,7 +177,7 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC) { if (!timer_through_8259) - disable_8259A_irq(0); + legacy_pic->chip->mask(0); on_each_cpu(__acpi_nmi_disable, NULL, 1); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index b80987ca33ea..1c790e75f7a0 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -34,6 +34,12 @@ static int i8259A_auto_eoi; DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +static void mask_8259A(void); +static void unmask_8259A(void); +static void disable_8259A_irq(unsigned int irq); +static void enable_8259A_irq(unsigned int irq); +static void init_8259A(int auto_eoi); +static int i8259A_irq_pending(unsigned int irq); struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -63,7 +69,7 @@ unsigned int cached_irq_mask = 0xffff; */ unsigned long io_apic_irqs; -void disable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; @@ -77,7 +83,7 @@ void disable_8259A_irq(unsigned int irq) spin_unlock_irqrestore(&i8259A_lock, flags); } -void enable_8259A_irq(unsigned int irq) +static void enable_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; @@ -91,7 +97,7 @@ void enable_8259A_irq(unsigned int irq) spin_unlock_irqrestore(&i8259A_lock, flags); } -int i8259A_irq_pending(unsigned int irq) +static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<init(0); /* * 16 old-style INTA-cycle interrupts: diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3e6150d421e4..f7a52f4a21a5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,6 +68,7 @@ #include #include +#include #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; @@ -287,9 +288,9 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); enable_NMI_through_LVT0(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index b48ef6c0d716..f067e9556a47 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -505,7 +505,7 @@ static struct irq_chip cobalt_irq_type = { */ static unsigned int startup_piix4_master_irq(unsigned int irq) { - init_8259A(0); + legacy_pic->init(0); return startup_cobalt_irq(irq); } @@ -529,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { .name = "PIIX4-virtual", - .shutdown = disable_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, }; @@ 
-606,7 +603,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) handle_IRQ_event(realirq, desc->action); if (!(desc->status & IRQ_DISABLED)) - enable_8259A_irq(realirq); + legacy_pic->chip->unmask(realirq); return IRQ_HANDLED; @@ -625,6 +622,12 @@ static struct irqaction cascade_action = { .name = "cascade", }; +static inline void set_piix4_virtual_irq_type(void) +{ + piix4_virtual_irq_type.shutdown = i8259A_chip.mask; + piix4_virtual_irq_type.enable = i8259A_chip.unmask; + piix4_virtual_irq_type.disable = i8259A_chip.mask; +} void init_VISWS_APIC_irqs(void) { @@ -650,6 +653,7 @@ void init_VISWS_APIC_irqs(void) desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { + set_piix4_virtual_irq_type(); desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { -- cgit v1.2.2 From 1f91233c26fd5f7d6525fd29b95e4b50ca7a3e88 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 5 Feb 2010 04:06:56 -0800 Subject: x86, apic: Remove ioapic_disable_legacy() The ioapic_disable_legacy() call is no longer needed for platforms that do not have a legacy PIC; the legacy_pic abstraction takes care of it automatically. This patch also initializes irq-related static variables based on information obtained from legacy_pic. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A30A7660@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_apic.h | 2 -- arch/x86/kernel/apic/io_apic.c | 10 +++++----- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 7c7c16cde1f8..84fdd5110948 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,8 +143,6 @@ extern int noioapicreroute; /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; -extern void io_apic_disable_legacy(void); - /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1704cd82db5f..3592a72f3f0a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -143,11 +143,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; static struct irq_cfg irq_cfgx[NR_IRQS]; #endif -void __init io_apic_disable_legacy(void) -{ - nr_irqs_gsi = 0; -} - int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -156,6 +151,11 @@ int __init arch_early_irq_init(void) int node; int i; + if (!legacy_pic->nr_legacy_irqs) { + nr_irqs_gsi = 0; + io_apic_irqs = ~0UL; + } + cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); node= cpu_to_node(boot_cpu_id); -- cgit v1.2.2
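The preceding legacy_pic patches amount to a null-object pattern: platforms without an i8259 point legacy_pic at a structure whose nr_legacy_irqs is 0 and whose ops do nothing, so callers need no #ifdefs. A toy version of that pattern, with names following the kernel structs loosely and the platform check stubbed out:

#include <stdio.h>

struct legacy_pic {
	int nr_legacy_irqs;
	void (*mask_all)(void);
};

static void noop_mask_all(void) { }
static void i8259_mask_all(void) { puts("masking 8259A"); }

static struct legacy_pic null_pic = { 0, noop_mask_all };
static struct legacy_pic i8259_pic = { 16, i8259_mask_all };

static struct legacy_pic *legacy_pic = &i8259_pic;

int main(void)
{
	if (0 /* e.g. an x86 MID platform was detected at boot */)
		legacy_pic = &null_pic;

	/* callers just use the ops; no legacy-PIC #ifdefs needed */
	if (legacy_pic->nr_legacy_irqs)
		legacy_pic->mask_all();
	return 0;
}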
Maybe we might even replace the pte argument with the pte_t? Ben Herrenschmidt would also like the pte pointer for PowerPC: Passing the ptep in there is exactly what I want. I want that -instead- of the PTE value, because I have issue on some ppc cases, for I$/D$ coherency, where set_pte_at() may decide to mask out the _PAGE_EXEC. So, pass in the mapped page table pointer into update_mmu_cache(), and remove the PTE value, updating all implementations and call sites to suit. Includes a fix from Stephen Rothwell: sparc: fix fallout from update_mmu_cache API change Signed-off-by: Stephen Rothwell Acked-by: Benjamin Herrenschmidt Signed-off-by: Russell King --- arch/x86/include/asm/pgtable_32.h | 2 +- arch/x86/include/asm/pgtable_64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 01fd9461d323..a28668396508 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -80,7 +80,7 @@ do { \ * The i386 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. */ -#define update_mmu_cache(vma, address, pte) do { } while (0) +#define update_mmu_cache(vma, address, ptep) do { } while (0) #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index c57a30117149..181be528c612 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -129,7 +129,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) /* NOP */ #define pte_unmap_nested(pte) /* NOP */ -#define update_mmu_cache(vma, address, pte) do { } while (0) +#define update_mmu_cache(vma, address, ptep) do { } while (0) /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE -- cgit v1.2.2 From 281ff33b7c1b1ba2a5f9b03425e5f692a94913fa Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 18 Feb 2010 11:51:40 -0800 Subject: x86_64, cpa: Don't work hard in preserving kernel 2M mappings when using 4K already We currently enforce the !RW mapping for the kernel mapping that maps holes between different text, rodata and data sections. However, kernel identity mappings will have different RWX permissions to the pages mapping to text and to the pages padding (which are freed) the text, rodata sections. Hence kernel identity mappings will be broken to smaller pages. For 64-bit, kernel text and kernel identity mappings are different, so we can enable protection checks that come with CONFIG_DEBUG_RODATA, as well as retain 2MB large page mappings for kernel text. Konrad reported a boot failure with the Linux Xen paravirt guest because of this. In this paravirt guest case, the kernel text mapping and the kernel identity mapping share the same page-table pages. Thus forcing the !RW mapping for some of the kernel mappings also causes the kernel identity mappings to be read-only, resulting in the boot failure. The Linux Xen paravirt guest also uses 4k mappings and doesn't use 2M mappings. Fix this issue and retain the large page performance advantage for native kernels by not working hard and not enforcing !RW for the kernel text mapping, if the current mapping is already using small page mapping. Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Suresh Siddha LKML-Reference: <1266522700.2909.34.camel@sbs-t61.sc.intel.com> Tested-by: Konrad Rzeszutek Wilk Cc: stable@kernel.org [2.6.32, 2.6.33] Signed-off-by: H.
Peter Anvin --- arch/x86/mm/pageattr.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1d4eb93d333c..cf07c26d9a4a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -291,8 +291,29 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, */ if (kernel_set_to_readonly && within(address, (unsigned long)_text, - (unsigned long)__end_rodata_hpage_align)) - pgprot_val(forbidden) |= _PAGE_RW; + (unsigned long)__end_rodata_hpage_align)) { + unsigned int level; + + /* + * Don't enforce the !RW mapping for the kernel text mapping, + * if the current mapping is already using small page mapping. + * No need to work hard to preserve large page mappings in this + * case. + * + * This also fixes the Linux Xen paravirt guest boot failure + * (because of unexpected read-only mappings for kernel identity + * mappings). In this paravirt guest case, the kernel text + * mapping and the kernel identity mapping share the same + * page-table pages. Thus we can't really use different + * protections for the kernel text and identity mappings. Also, + * these shared mappings are made of small page mappings. + * Thus, not enforcing the !RW mapping for the small page kernel + * text mapping will help the Linux Xen paravirt guest boot + * as well. + */ + if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) + pgprot_val(forbidden) |= _PAGE_RW; + } #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); -- cgit v1.2.2 From 93da6202264ce1256b04db8008a43882ae62d060 Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Tue, 12 Jan 2010 16:56:37 -0800 Subject: x86/PCI: irq and pci_ids patch for Intel Cougar Point DeviceIDs This patch adds the Intel Cougar Point (PCH) LPC and SMBus Controller DeviceIDs. Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0696d506c4ad..b02f6d8ac922 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -590,6 +590,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_CPT_LPC1: + case PCI_DEVICE_ID_INTEL_CPT_LPC2: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; -- cgit v1.2.2 From b26b2d494b659f988b4d75eb394dfa0ddac415c9 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Fri, 1 Jan 2010 17:40:49 +0100 Subject: resource/PCI: align functions now return start of resource As suggested by Linus, align functions should return the start of a resource, not void. An update of "res->start" is no longer necessary. Cc: Bjorn Helgaas Cc: Yinghai Lu Signed-off-by: Dominik Brodowski Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 5dc9e8c63fcd..924e40c916d3 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -60,22 +60,20 @@ skip_isa_ioresource_align(struct pci_dev *dev) { * but we want to try to avoid allocating at 0x2900-0x2bff * which might have be mirrored at 0x0100-0x03ff..
*/ -void +resource_size_t pcibios_align_resource(void *data, struct resource *res, resource_size_t size, resource_size_t align) { struct pci_dev *dev = data; + resource_size_t start = res->start; if (res->flags & IORESOURCE_IO) { - resource_size_t start = res->start; - if (skip_isa_ioresource_align(dev)) - return; - if (start & 0x300) { + return start; + if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; - res->start = start; - } } + return start; } EXPORT_SYMBOL(pcibios_align_resource); -- cgit v1.2.2 From 3b7a17fcdae532d29dffab9d564a28be08960988 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Fri, 1 Jan 2010 17:40:50 +0100 Subject: resource/PCI: mark struct resource as const Now that we return the new resource start position, there is no need to update "struct resource" inside the align function. Therefore, mark the struct resource as const. Cc: Bjorn Helgaas Cc: Yinghai Lu Signed-off-by: Dominik Brodowski Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 924e40c916d3..5a8fbf8d4cac 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -61,7 +61,7 @@ skip_isa_ioresource_align(struct pci_dev *dev) { * which might have be mirrored at 0x0100-0x03ff.. */ resource_size_t -pcibios_align_resource(void *data, struct resource *res, +pcibios_align_resource(void *data, const struct resource *res, resource_size_t size, resource_size_t align) { struct pci_dev *dev = data; -- cgit v1.2.2 From 2fe2abf896c1e7a0ee65faaf3ef0ce654848abbd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 23 Feb 2010 10:24:36 -0700 Subject: PCI: augment bus resource table with a list Previously we used a table of size PCI_BUS_NUM_RESOURCES (16) for resources forwarded to a bus by its upstream bridge. We've increased this size several times when the table overflowed. But there's no good limit on the number of resources because host bridges and subtractive decode bridges can forward any number of ranges to their secondary buses. This patch reduces the table to only PCI_BRIDGE_RESOURCE_NUM (4) entries, which corresponds to the number of windows a PCI-to-PCI (3) or CardBus (4) bridge can positively decode. Any additional resources, e.g., PCI host bridge windows or subtractively-decoded regions, are kept in a list. I'd prefer a single list rather than this split table/list approach, but that requires simultaneous changes to every architecture. This approach only requires immediate changes where we set up (a) host bridges with more than four windows and (b) subtractive-decode P2P bridges, and we can incrementally change other architectures to use the list. 
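To make the new model concrete, here is a minimal editorial sketch (not part of the patch set) of how arch code with an arbitrary number of host bridge windows can populate a root bus, using the pci_bus_add_resource()/pci_bus_remove_resources() interfaces added here together with the pci_bus_for_each_resource() iterator introduced alongside them; the window array and its count are hypothetical:

#include <linux/pci.h>
#include <linux/ioport.h>

static void __init sketch_set_root_bus_windows(struct pci_bus *bus,
					       struct resource *win, int nwin)
{
	struct resource *res;
	int i;

	/* Drop any default resources the bus was created with */
	pci_bus_remove_resources(bus);

	/*
	 * Each window is appended to the bus->resources list, so there
	 * is no fixed PCI_BUS_NUM_RESOURCES-style limit any more.
	 */
	for (i = 0; i < nwin; i++)
		pci_bus_add_resource(bus, &win[i], 0);

	/*
	 * Consumers walk the bus->resource[] table and then the list;
	 * table slots may be NULL, so check each entry.
	 */
	pci_bus_for_each_resource(bus, res, i) {
		if (res)
			printk(KERN_DEBUG "bridge window %pR\n", res);
	}
}

The split table/list layout keeps existing bridge code (which indexes the four positively-decoded windows directly) working unchanged, while removing the overflow problem for host bridges.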
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 33 ++++----------------------------- arch/x86/pci/bus_numa.c | 3 ++- arch/x86/pci/bus_numa.h | 3 +-- 3 files changed, 7 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 959e548a7039..a2f8cdb8c1d5 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -45,20 +45,6 @@ count_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static int -bus_has_transparent_bridge(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - u16 class = dev->class >> 8; - - if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) - return true; - } - return false; -} - static void align_resource(struct acpi_device *bridge, struct resource *res) { @@ -92,12 +78,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data) acpi_status status; unsigned long flags; struct resource *root; - int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; u64 start, end; - if (bus_has_transparent_bridge(info->bus)) - max_root_bus_resources -= 3; - status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) return AE_OK; @@ -115,15 +97,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) start = addr.minimum + addr.translation_offset; end = start + addr.address_length - 1; - if (info->res_num >= max_root_bus_resources) { - if (pci_probe & PCI_USE__CRS) - printk(KERN_WARNING "PCI: Failed to allocate " - "0x%lx-0x%lx from %s for %s due to _CRS " - "returning more than %d resource descriptors\n", - (unsigned long) start, (unsigned long) end, - root->name, info->name, max_root_bus_resources); - return AE_OK; - } res = &info->res[info->res_num]; res->name = info->name; @@ -143,7 +116,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) dev_err(&info->bridge->dev, "can't allocate host bridge window %pR\n", res); } else { - info->bus->resource[info->res_num] = res; + pci_bus_add_resource(info->bus, res, 0); info->res_num++; if (addr.translation_offset) dev_info(&info->bridge->dev, "host bridge window %pR " @@ -164,7 +137,9 @@ get_current_resources(struct acpi_device *device, int busnum, struct pci_root_info info; size_t size; - if (!(pci_probe & PCI_USE__CRS)) + if (pci_probe & PCI_USE__CRS) + pci_bus_remove_resources(bus); + else dev_info(&device->dev, "ignoring host bridge windows from ACPI; " "boot with \"pci=use_crs\" to use them\n"); diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index f939d603adfa..12d54ff3654d 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -36,13 +36,14 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", b->number); + pci_bus_remove_resources(b); info = &pci_root_info[i]; for (j = 0; j < info->res_num; j++) { struct resource *res; struct resource *root; res = &info->res[j]; - b->resource[j] = res; + pci_bus_add_resource(b, res, 0); if (res->flags & IORESOURCE_IO) root = &ioport_resource; else diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index adbc23fe82ac..731b64ee8d84 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -2,8 +2,7 @@ /* * sub bus (transparent) will use entres from 3 to store extra from - * root, so need to make sure we have enough slot there, Should we - * increase PCI_BUS_NUM_RESOURCES? + * root, so need to make sure we have enough slot there. 
*/ #define RES_NUM 16 struct pci_root_info { -- cgit v1.2.2 From 7bc5e3f2be32ae6fb0c74cd0f707f986b3a01a26 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 23 Feb 2010 10:24:41 -0700 Subject: x86/PCI: use host bridge _CRS info by default on 2008 and newer machines The main benefit of using ACPI host bridge window information is that we can do better resource allocation in systems with multiple host bridges, e.g., http://bugzilla.kernel.org/show_bug.cgi?id=14183 Sometimes we need _CRS information even if we only have one host bridge, e.g., https://bugs.launchpad.net/ubuntu/+source/linux/+bug/341681 Most of these systems are relatively new, so this patch turns on "pci=use_crs" only on machines with a BIOS date of 2008 or newer. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/acpi.c | 53 +++++++++++++++++++++++++++++++++++++----- arch/x86/pci/common.c | 3 +++ 3 files changed, 51 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b4bf9a942ed0..05b58ccb2e82 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -29,6 +29,7 @@ #define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 #define PCI_HAS_IO_ECS 0x40000 #define PCI_NOASSIGN_ROMS 0x80000 +#define PCI_ROOT_NO_CRS 0x100000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index a2f8cdb8c1d5..5f11ff6f5389 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -15,6 +15,51 @@ struct pci_root_info { int busnum; }; +static bool pci_use_crs = true; + +static int __init set_use_crs(const struct dmi_system_id *id) +{ + pci_use_crs = true; + return 0; +} + +static const struct dmi_system_id pci_use_crs_table[] __initconst = { + /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ + { + .callback = set_use_crs, + .ident = "IBM System x3800", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), + }, + }, + {} +}; + +void __init pci_acpi_crs_quirks(void) +{ + int year; + + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) + pci_use_crs = false; + + dmi_check_system(pci_use_crs_table); + + /* + * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that + * takes precedence over anything we figured out above. + */ + if (pci_probe & PCI_ROOT_NO_CRS) + pci_use_crs = false; + else if (pci_probe & PCI_USE__CRS) + pci_use_crs = true; + + printk(KERN_INFO "PCI: %s host bridge windows from ACPI; " + "if necessary, use \"pci=%s\" and report a bug\n", + pci_use_crs ? "Using" : "Ignoring", + pci_use_crs ? 
"nocrs" : "use_crs"); +} + static acpi_status resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) @@ -106,7 +151,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->child = NULL; align_resource(info->bridge, res); - if (!(pci_probe & PCI_USE__CRS)) { + if (!pci_use_crs) { dev_printk(KERN_DEBUG, &info->bridge->dev, "host bridge window %pR (ignored)\n", res); return AE_OK; @@ -137,12 +182,8 @@ get_current_resources(struct acpi_device *device, int busnum, struct pci_root_info info; size_t size; - if (pci_probe & PCI_USE__CRS) + if (pci_use_crs) pci_bus_remove_resources(bus); - else - dev_info(&device->dev, - "ignoring host bridge windows from ACPI; " - "boot with \"pci=use_crs\" to use them\n"); info.bridge = device; info.bus = bus; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index d2552c68e94d..3736176acaab 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -520,6 +520,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "use_crs")) { pci_probe |= PCI_USE__CRS; return NULL; + } else if (!strcmp(str, "nocrs")) { + pci_probe |= PCI_ROOT_NO_CRS; + return NULL; } else if (!strcmp(str, "earlydump")) { pci_early_dump_regs = 1; return NULL; -- cgit v1.2.2 From ff7fbc72e0c3ef7e94a27a3a918fd09ec9a30204 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 22 Feb 2010 14:51:33 -0800 Subject: x86, ptrace: Simplify xstateregs_get() 48 bytes (bytes 464..511) of the xstateregs payload come from the kernel defined structure (xstate_fx_sw_bytes). Rest comes from the xstate regs structure in the thread struct. Instead of having multiple user_regset_copyout()'s, simplify the xstateregs_get() by first copying the SW bytes into the xstate regs structure in the thread structure and then using one user_regset_copyout() to copyout the xstateregs. Requested-by: Roland McGrath Signed-off-by: Suresh Siddha LKML-Reference: <20100222225240.494688491@sbs-t61.sc.intel.com> Acked-by: Roland McGrath Signed-off-by: H. Peter Anvin Cc: Oleg Nesterov --- arch/x86/kernel/i387.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 7a8a193b5144..81e23bf12c12 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -243,34 +243,18 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, return ret; /* - * First copy the fxsave bytes 0..463. + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->xsave, 0, - offsetof(struct user_xstateregs, - i387.xstate_fx_sw)); - if (ret) - return ret; - - /* - * Copy the 48bytes defined by software. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - xstate_fx_sw_bytes, - offsetof(struct user_xstateregs, - i387.xstate_fx_sw), - offsetof(struct user_xstateregs, - xsave_hdr)); - if (ret) - return ret; + memcpy(&target->thread.xstate->fxsave.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); /* - * Copy the rest of xstate memory layout. + * Copy the xstate memory layout. 
*/ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->xsave.xsave_hdr, - offsetof(struct user_xstateregs, - xsave_hdr), -1); + &target->thread.xstate->xsave, 0, -1); return ret; } -- cgit v1.2.2 From 6dbbe14f21368a45aedba7eab0221857b8ad8d16 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 22 Feb 2010 14:51:34 -0800 Subject: x86, ptrace: Remove set_stopped_child_used_math() in [x]fpregs_set init_fpu() already ensures that used_math() is set for the stopped child. Remove the redundant set_stopped_child_used_math() in [x]fpregs_set(). Reported-by: Oleg Nesterov Signed-off-by: Suresh Siddha LKML-Reference: <20100222225240.642169080@sbs-t61.sc.intel.com> Acked-by: Roland McGrath Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 81e23bf12c12..c01a2b846d47 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -209,8 +209,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fxsave, 0, -1); @@ -471,8 +469,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); -- cgit v1.2.2 From 28a3c93d11212655ce0a9be977c405c703844164 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Feb 2010 02:03:31 -0800 Subject: x86, pic: Fix section mismatch in legacy pic Move the legacy_pic chip dummy functions out of the init section, as they might be referenced at run time. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D3AA@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i8259.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 9bac6817456f..fb725ee15f55 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -371,9 +371,9 @@ static void init_8259A(int auto_eoi) * platforms, such as x86 MID. */ -static void __init legacy_pic_noop(void) { }; -static void __init legacy_pic_uint_noop(unsigned int unused) { }; -static void __init legacy_pic_int_noop(int unused) { }; +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; static struct irq_chip dummy_pic_chip = { .name = "dummy pic", -- cgit v1.2.2 From 05ddafb17ad1a73c8bc333cb328bad46513e85e7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 23 Sep 2009 07:20:23 -0700 Subject: x86, ioapic: Early enable ioapic for timer irq The Moorestown platform needs the apic ready early for the system timer irq, which is delivered via the ioapic. Should not impact other platforms. In the longer term, once ioapic setup is moved before late time init, we will not need this patch to do early apic enabling. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D07@orsmsx508.amr.corp.intel.com> Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/io_apic.h | 1 + arch/x86/kernel/apic/io_apic.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 31dfb42d8649..99a416ed16b7 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -187,6 +187,7 @@ extern struct mp_ioapic_gsi mp_gsi_routing[]; int mp_find_ioapic(int gsi); int mp_find_ioapic_pin(int ioapic, int gsi); void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); +extern void __init pre_init_apic_IRQ0(void); #else /* !CONFIG_X86_IO_APIC */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b34854358ee6..8c848b5877a0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4289,3 +4289,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } + +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + + printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); +#endif + desc = irq_to_desc_alloc_node(0, 0); + + setup_local_APIC(); + + cfg = irq_cfg(0); + add_pin_to_irq_node(cfg, 0, 0, 0); + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); +} -- cgit v1.2.2 From 4966e1affb45c5fc402969e10e979407b972a7df Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Feb 2010 10:43:58 -0800 Subject: x86, ioapic: Add dummy ioapic functions Some ioapic extern functions are used when CONFIG_X86_IO_APIC is not defined. We need the dummy functions to avoid a compile time error. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318DA07@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_apic.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 99a416ed16b7..35832a03a515 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -197,7 +197,11 @@ static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } static inline void ioapic_insert_resources(void) { } static inline void probe_nr_irqs_gsi(void) { } +static inline int mp_find_ioapic(int gsi) { return 0; } +struct io_apic_irq_attr; +static inline int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { return 0; } #endif #endif /* _ASM_X86_IO_APIC_H */ -- cgit v1.2.2 From a712ffbc199849364c46e9112b93b66de08e2c26 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Thu, 4 Feb 2010 10:59:27 -0800 Subject: x86/PCI: Moorestown PCI support The Moorestown platform only has a few devices that actually support PCI config cycles. The rest of the devices use an in-RAM MCFG space for the purposes of device enumeration and initialization. There are a few uglies in the fake support, like BAR sizes that aren't a power of two, sizing detection, and writes to the real devices, but other than that it's pretty straightforward. Another way to think of this is not really as PCI at all, but just a table in RAM describing which devices are present, their capabilities and their offsets in MMIO space. 
This could have been done with a special new firmware table on this platform, but given that we do have some real PCI devices too, simply describing things in an MCFG type space was pretty simple. Signed-off-by: Jesse Barnes LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D08@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/pci/Makefile | 2 +- arch/x86/pci/mrst.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 arch/x86/pci/mrst.c (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 39fba37f702f..4753ebc19cae 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-y += common.o early.o +obj-y += common.o early.o mrst.o obj-y += amd_bus.o obj-$(CONFIG_X86_64) += bus_numa.o diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c new file mode 100644 index 000000000000..6e9e1a35a5d7 --- /dev/null +++ b/arch/x86/pci/mrst.c @@ -0,0 +1,258 @@ +/* + * Moorestown PCI support + * Copyright (c) 2008 Intel Corporation + * Jesse Barnes + * + * Moorestown has an interesting PCI implementation: + * - configuration space is memory mapped (as defined by MCFG) + * - Lincroft devices also have a real, type 1 configuration space + * - Early Lincroft silicon has a type 1 access bug that will cause + * a hang if non-existent devices are accessed + * - some devices have the "fixed BAR" capability, which means + * they can't be relocated or modified; check for that during + * BAR sizing + * + * So, we use the MCFG space for all reads and writes, but also send + * Lincroft writes to type 1 space. But only read/write if the device + * actually exists, otherwise return all 1s for reads and bit bucket + * the writes. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define PCIE_CAP_OFFSET 0x100 + +/* Fixed BAR fields */ +#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */ +#define PCI_FIXED_BAR_0_SIZE 0x04 +#define PCI_FIXED_BAR_1_SIZE 0x08 +#define PCI_FIXED_BAR_2_SIZE 0x0c +#define PCI_FIXED_BAR_3_SIZE 0x10 +#define PCI_FIXED_BAR_4_SIZE 0x14 +#define PCI_FIXED_BAR_5_SIZE 0x1c + +/** + * fixed_bar_cap - return the offset of the fixed BAR cap if found + * @bus: PCI bus + * @devfn: device in question + * + * Look for the fixed BAR cap on @bus and @devfn, returning its offset + * if found or 0 otherwise. 
+ */ +static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn) +{ + int pos; + u32 pcie_cap = 0, cap_data; + + pos = PCIE_CAP_OFFSET; + while (pos) { + if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, pos, 4, &pcie_cap)) + return 0; + + if (pcie_cap == 0xffffffff) + return 0; + + if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) { + raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, pos + 4, 4, &cap_data); + if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR) + return pos; + } + + pos = pcie_cap >> 20; + } + + return 0; +} + +static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 val, int offset) +{ + u32 size; + unsigned int domain, busnum; + int bar = (reg - PCI_BASE_ADDRESS_0) >> 2; + + domain = pci_domain_nr(bus); + busnum = bus->number; + + if (val == ~0 && len == 4) { + unsigned long decode; + + raw_pci_ext_ops->read(domain, busnum, devfn, + offset + 8 + (bar * 4), 4, &size); + + /* Turn the size into a decode pattern for the sizing code */ + if (size) { + decode = size - 1; + decode |= decode >> 1; + decode |= decode >> 2; + decode |= decode >> 4; + decode |= decode >> 8; + decode |= decode >> 16; + decode++; + decode = ~(decode - 1); + } else { + decode = ~0; + } + + /* + * If val is all ones, the core code is trying to size the reg, + * so update the mmconfig space with the real size. + * + * Note: this assumes the fixed size we got is a power of two. + */ + return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4, + decode); + } + + /* This is some other kind of BAR write, so just do it. */ + return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val); +} + +/** + * type1_access_ok - check whether to use type 1 + * @bus: bus number + * @devfn: device & function in question + * + * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at + * all, then we can go ahead with any reads & writes. If it's on a Lincroft, + * but doesn't exist, avoid the access altogether to keep the chip from + * hanging. + */ +static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) +{ + /* This is a workaround for an A0 LNC bug where the PCI status register does + * not have the new CAP bit set. It can not be written by SW either. + * + * The PCI header type in real LNC indicates a single function device; this + * will prevent probing other devices under the same function in PCI + * shim. Therefore, use the header type in shim instead. + */ + if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) + return 0; + if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0))) + return 1; + return 0; /* langwell on others */ +} + +static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 *value) +{ + if (type1_access_ok(bus->number, devfn, where)) + return pci_direct_conf1.read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); + return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); +} + +static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 value) +{ + int offset; + + /* On MRST, there is no PCI ROM BAR; this will cause a subsequent read + * of the ROM BAR to return 0 and then be ignored.
+ */ + if (where == PCI_ROM_ADDRESS) + return 0; + + /* + * Devices with fixed BARs need special handling: + * - BAR sizing code will save, write ~0, read size, restore + * - so writes to fixed BARs need special handling + * - other writes to fixed BAR devices should go through mmconfig + */ + offset = fixed_bar_cap(bus, devfn); + if (offset && + (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) { + return pci_device_update_fixed(bus, devfn, where, size, value, + offset); + } + + /* + * On Moorestown update both real & mmconfig space + * Note: early Lincroft silicon can't handle type 1 accesses to + * non-existent devices, so just eat the write in that case. + */ + if (type1_access_ok(bus->number, devfn, where)) + return pci_direct_conf1.write(pci_domain_nr(bus), bus->number, + devfn, where, size, value); + return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn, + where, size, value); +} + +static int mrst_pci_irq_enable(struct pci_dev *dev) +{ + u8 pin; + struct io_apic_irq_attr irq_attr; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + + /* MRST only has an IOAPIC; the PCI irq lines are 1:1 mapped to + * IOAPIC RTE entries, so we just enable the RTE for the device. + */ + irq_attr.ioapic = mp_find_ioapic(dev->irq); + irq_attr.ioapic_pin = dev->irq; + irq_attr.trigger = 1; /* level */ + irq_attr.polarity = 1; /* active low */ + io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); + + return 0; +} + +struct pci_ops pci_mrst_ops = { + .read = pci_read, + .write = pci_write, +}; + +/** + * pci_mrst_init - installs pci_mrst_ops + * + * Moorestown has an interesting PCI implementation (see above). + * Called when the early platform detection installs it. + */ +int __init pci_mrst_init(void) +{ + printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n"); + pci_mmcfg_late_init(); + pcibios_enable_irq = mrst_pci_irq_enable; + pci_root_ops = pci_mrst_ops; + /* Continue with standard init */ + return 1; +} + +/* + * Langwell devices reside at fixed offsets, don't try to move them. + */ +static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev) +{ + unsigned long offset; + u32 size; + int i; + + /* Fixup the BAR sizes for fixed BAR devices and make them unmovable */ + offset = fixed_bar_cap(dev->bus, dev->devfn); + if (!offset || PCI_DEVFN(2, 0) == dev->devfn || + PCI_DEVFN(2, 2) == dev->devfn) + return; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) { + pci_read_config_dword(dev, offset + 8 + (i * 4), &size); + dev->resource[i].end = dev->resource[i].start + size - 1; + dev->resource[i].flags |= IORESOURCE_PCI_FIXED; + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup); -- cgit v1.2.2 From 5b78b6724a405d4b649db53f7aac28c930c89640 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 12 Feb 2010 02:29:11 -0800 Subject: x86, mrst: Add dummy legacy pic to platform setup Moorestown has no legacy PIC; point it to the null legacy PIC. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D09@orsmsx508.amr.corp.intel.com> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/mrst.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 3b7078abc871..bdf9b17290c6 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -12,6 +12,9 @@ #include #include +#include +#include +#include /* * Moorestown specific x86_init function overrides and early setup @@ -21,4 +24,6 @@ void __init x86_mrst_early_setup(void) { x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + + legacy_pic = &null_legacy_pic; } -- cgit v1.2.2 From af2730f6eefce24c4ef1dc3f8267d33626db81bc Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 12 Feb 2010 10:31:47 -0800 Subject: x86, mrst: Fill in PCI functions in x86_init layer This patch adds Moorestown platform specific PCI init functions. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0A@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mrst.h | 15 +++++++++++++++ arch/x86/kernel/mrst.c | 3 +++ 2 files changed, 18 insertions(+) create mode 100644 arch/x86/include/asm/mrst.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h new file mode 100644 index 000000000000..57a177a8e823 --- /dev/null +++ b/arch/x86/include/asm/mrst.h @@ -0,0 +1,15 @@ +/* + * mrst.h: Intel Moorestown platform specific setup code + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _ASM_X86_MRST_H +#define _ASM_X86_MRST_H +extern int pci_mrst_init(void); + +#endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index bdf9b17290c6..98440e18919b 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,5 +25,8 @@ void __init x86_mrst_early_setup(void) x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + x86_init.pci.init = pci_mrst_init; + x86_init.pci.fixup_irqs = x86_init_noop; + legacy_pic = &null_legacy_pic; } -- cgit v1.2.2 From 16ab5395856d8953ae3d81e81bd6a8c269a1bfd6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 12 Feb 2010 03:08:30 -0800 Subject: x86, mrst: Add platform timer info parsing code Moorestown platform timer information is obtained from SFI FW tables. This patch parses the SFI table, then assigns the irq information to mp_irqs. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0B@orsmsx508.amr.corp.intel.com> Signed-off-by: H.
*/ #include +#include +#include +#include #include +#include +#include +#include +#include #include #include #include +static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; +static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +int sfi_mtimer_num; + +static inline void assign_to_mp_irq(struct mpc_intsrc *m, + struct mpc_intsrc *mp_irq) +{ + memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static void save_mp_irq(struct mpc_intsrc *m) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +/* parse all the mtimer info to a static mtimer array */ +static int __init sfi_parse_mtmr(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_timer_table_entry *pentry; + struct mpc_intsrc mp_irq; + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mtimer_num) { + sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_timer_table_entry); + pentry = (struct sfi_timer_table_entry *) sb->pentry; + totallen = sfi_mtimer_num * sizeof(*pentry); + memcpy(sfi_mtimer_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pentry = sfi_mtimer_array; + for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { + printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + " irq = %d\n", totallen, (u32)pentry->phys_addr, + pentry->freq_hz, pentry->irq); + if (!pentry->irq) + continue; + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; +/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + + return 0; +} + +struct sfi_timer_table_entry *sfi_get_mtmr(int hint) +{ + int i; + if (hint < sfi_mtimer_num) { + if (!sfi_mtimer_usage[hint]) { + pr_debug("hint taken for timer %d irq %d\n",\ + hint, sfi_mtimer_array[hint].irq); + sfi_mtimer_usage[hint] = 1; + return &sfi_mtimer_array[hint]; + } + } + /* take the first timer available */ + for (i = 0; i < sfi_mtimer_num;) { + if (!sfi_mtimer_usage[i]) { + sfi_mtimer_usage[i] = 1; + return &sfi_mtimer_array[i]; + } + i++; + } + return NULL; +} + +void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) +{ + int i; + for (i = 0; i < sfi_mtimer_num;) { + if (mtmr->irq == sfi_mtimer_array[i].irq) { + sfi_mtimer_usage[i] = 0; + return; + } + i++; + } +} + /* * Moorestown specific x86_init function overrides and early setup * calls. -- cgit v1.2.2 From cf089455966e21aeb8e4cd2669e0c1885667b04e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 12 Feb 2010 03:37:38 -0800 Subject: x86, mrst: Add vrtc platform data setup code vRTC information is obtained from SFI tables on Moorestown, this patch parses these tables and assign the information. Signed-off-by: Feng Tang LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0D@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mrst.h | 2 ++ arch/x86/kernel/mrst.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index fa144f2dd256..451d30e7f62d 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,7 +11,9 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); +int __init sfi_parse_mrtc(struct sfi_table_header *table); #define SFI_MTMR_MAX_NUM 8 +#define SFI_MRTC_MAX 8 #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index bb6e45c71dde..b7fa049c826c 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,10 @@ static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; int sfi_mtimer_num; +struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; +EXPORT_SYMBOL_GPL(sfi_mrtc_array); +int sfi_mrtc_num; + static inline void assign_to_mp_irq(struct mpc_intsrc *m, struct mpc_intsrc *mp_irq) { @@ -126,6 +131,46 @@ void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) } } +/* parse all the mrtc info to a global mrtc array */ +int __init sfi_parse_mrtc(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_rtc_table_entry *pentry; + struct mpc_intsrc mp_irq; + + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mrtc_num) { + sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_rtc_table_entry); + pentry = (struct sfi_rtc_table_entry *)sb->pentry; + totallen = sfi_mrtc_num * sizeof(*pentry); + memcpy(sfi_mrtc_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pentry = sfi_mrtc_array; + for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { + printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + totallen, (u32)pentry->phys_addr, pentry->irq); + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = 0; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + return 0; +} + +void __init mrst_rtc_init(void) +{ + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. -- cgit v1.2.2 From bb24c4716185f6e116c440462c65c1f56649183b Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 2 Sep 2009 07:37:17 -0700 Subject: x86, apbt: Moorestown APB system timer driver Moorestown platform does not have PIT or HPET platform timers. Instead it has a bank of eight APB timers. The number of available timers to the os is exposed via SFI mtmr tables. All APB timer interrupts are routed via ioapic rtes and delivered as MSI. Currently, we use timer 0 and 1 for per cpu clockevent devices, timer 2 for clocksource. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D2D2@orsmsx508.amr.corp.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 11 + arch/x86/include/asm/apb_timer.h | 70 ++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/apb_timer.c | 780 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 862 insertions(+) create mode 100644 arch/x86/include/asm/apb_timer.h create mode 100644 arch/x86/kernel/apb_timer.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb4092568f9e..0ab2dcef7d84 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -390,6 +390,7 @@ config X86_MRST bool "Moorestown MID platform" depends on X86_32 depends on X86_EXTENDED_PLATFORM + select APB_TIMER ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: @@ -612,6 +613,16 @@ config HPET_EMULATE_RTC def_bool y depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) +config APB_TIMER + def_bool y if MRST + prompt "Langwell APB Timer Support" if X86_MRST + help + APB timer is the replacement for 8254, HPET on X86 MID platforms. + The APBT provides a stable time base on SMP + systems, unlike the TSC, but it is more expensive to access, + as it is off-chip. APB timers are always running regardless of CPU + C states, they are used as per CPU clockevent device when possible. + # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. config DMI diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h new file mode 100644 index 000000000000..c74a2eebe570 --- /dev/null +++ b/arch/x86/include/asm/apb_timer.h @@ -0,0 +1,70 @@ +/* + * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ * + * Note: + */ + +#ifndef ASM_X86_APBT_H +#define ASM_X86_APBT_H +#include + +#ifdef CONFIG_APB_TIMER + +/* Langwell DW APB timer registers */ +#define APBTMR_N_LOAD_COUNT 0x00 +#define APBTMR_N_CURRENT_VALUE 0x04 +#define APBTMR_N_CONTROL 0x08 +#define APBTMR_N_EOI 0x0c +#define APBTMR_N_INT_STATUS 0x10 + +#define APBTMRS_INT_STATUS 0xa0 +#define APBTMRS_EOI 0xa4 +#define APBTMRS_RAW_INT_STATUS 0xa8 +#define APBTMRS_COMP_VERSION 0xac +#define APBTMRS_REG_SIZE 0x14 + +/* register bits */ +#define APBTMR_CONTROL_ENABLE (1<<0) +#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */ +#define APBTMR_CONTROL_INT (1<<2) + +/* default memory mapped register base */ +#define LNW_SCU_ADDR 0xFF100000 +#define LNW_EXT_TIMER_OFFSET 0x1B800 +#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET) +#define LNW_EXT_TIMER_PGOFFSET 0x800 + +/* APBT clock speed range from PCLK to fabric base, 25-100MHz */ +#define APBT_MAX_FREQ 50 +#define APBT_MIN_FREQ 1 +#define APBT_MMAP_SIZE 1024 + +#define APBT_DEV_USED 1 + +extern void apbt_time_init(void); +extern struct clock_event_device *global_clock_event; +extern unsigned long apbt_quick_calibrate(void); +extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); +extern void apbt_setup_secondary_clock(void); +extern unsigned int boot_cpu_id; +extern int disable_apbt_percpu; + +extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); +extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); +extern int sfi_mtimer_num; + +#else /* CONFIG_APB_TIMER */ + +static inline unsigned long apbt_quick_calibrate(void) {return 0; } +static inline void apbt_time_init(void) { } + +#endif +#endif /* ASM_X86_APBT_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09bc5a52..4c58352209e0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o +obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c new file mode 100644 index 000000000000..83a345b0256c --- /dev/null +++ b/arch/x86/kernel/apb_timer.c @@ -0,0 +1,780 @@ +/* + * apb_timer.c: Driver for Langwell APB timers + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * Langwell is the south complex of the Intel Moorestown MID platform. There are + * eight external timers in total that can be used by the operating system. + * The timer information, such as frequency and addresses, is provided to the + * OS via SFI tables. + * Timer interrupts are routed via FW/HW emulated IOAPIC independently via + * individual redirection table entries (RTE). + * Unlike HPET, there is no master counter, therefore one of the timers is + * used as the clocksource. The overall allocation looks like: + * - timer 0 - NR_CPUs for per cpu timer + * - one timer for clocksource + * - one timer for watchdog driver. + * It is also worth noting that the APB timer does not support true one-shot mode; + * free-running mode will be used here to emulate one-shot mode.
+ * APB timer can also be used as broadcast timer along with per cpu local APIC + * timer, but by default APB timer has higher rating than local APIC timers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define APBT_MASK CLOCKSOURCE_MASK(32) +#define APBT_SHIFT 22 +#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKSOURCE_RATING 250 +#define APBT_MIN_DELTA_USEC 200 + +#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) +#define APBT_CLOCKEVENT0_NUM (0) +#define APBT_CLOCKEVENT1_NUM (1) +#define APBT_CLOCKSOURCE_NUM (2) + +static unsigned long apbt_address; +static int apb_timer_block_enabled; +static void __iomem *apbt_virt_address; +static int phy_cs_timer_id; + +/* + * Common DW APB timer info + */ +static uint64_t apbt_freq; + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt); +static cycle_t apbt_read_clocksource(struct clocksource *cs); +static void apbt_restart_clocksource(void); + +struct apbt_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int tick; + unsigned int count; + unsigned int flags; + char name[10]; +}; + +int disable_apbt_percpu __cpuinitdata; + +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); +static struct apbt_dev *apbt_devs; +#endif + +static inline unsigned long apbt_readl_reg(unsigned long a) +{ + return readl(apbt_virt_address + a); +} + +static inline void apbt_writel_reg(unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a); +} + +static inline unsigned long apbt_readl(int n, unsigned long a) +{ + return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_writel(int n, unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_set_mapping(void) +{ + struct sfi_timer_table_entry *mtmr; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (unsigned long)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (apbt_virt_address) { + pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ + (void *)apbt_address, (void *)apbt_virt_address); + } else { + pr_debug("Failed mapping APBT phy address at %p\n",\ + (void *)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) + / APBTMRS_REG_SIZE; + pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + return; + +panic_noapbt: + panic("Failed to setup APB system timer\n"); + +} + +static inline void apbt_clear_mapping(void) +{ + iounmap(apbt_virt_address); + apbt_virt_address = NULL; +} + +/* + * APBT timer interrupt enable / disable + */ +static inline int 
is_apbt_capable(void) +{ + return apbt_virt_address ? 1 : 0; +} + +static struct clocksource clocksource_apbt = { + .name = "apbt", + .rating = APBT_CLOCKSOURCE_RATING, + .read = apbt_read_clocksource, + .mask = APBT_MASK, + .shift = APBT_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = apbt_restart_clocksource, +}; + +/* boot APB clock event device */ +static struct clock_event_device apbt_clockevent = { + .name = "apbt0", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = apbt_set_mode, + .set_next_event = apbt_next_event, + .shift = APBT_SHIFT, + .irq = 0, + .rating = APBT_CLOCKEVENT_RATING, +}; + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + disable_apbt_percpu = 0; + else if (strcmp("lapic_and_apbt", arg) == 0) + disable_apbt_percpu = 1; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * start count down from 0xffff_ffff. this is done by toggling the enable bit + * then load initial load count to ~0. + */ +static void apbt_start_counter(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); + /* enable, mask interrupt */ + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + /* read it once to get cached counter value initialized */ + apbt_read_clocksource(&clocksource_apbt); +} + +static irqreturn_t apbt_interrupt_handler(int irq, void *data) +{ + struct apbt_dev *dev = (struct apbt_dev *)data; + struct clock_event_device *aevt = &dev->evt; + + if (!aevt->event_handler) { + printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", + dev->num); + return IRQ_NONE; + } + aevt->event_handler(aevt); + return IRQ_HANDLED; +} + +static void apbt_restart_clocksource(void) +{ + apbt_start_counter(phy_cs_timer_id); +} + +/* Setup IRQ routing via IOAPIC */ +#ifdef CONFIG_SMP +static void apbt_setup_irq(struct apbt_dev *adev) +{ + struct irq_chip *chip; + struct irq_desc *desc; + + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + desc = irq_to_desc(adev->irq); + chip = get_irq_chip(adev->irq); + disable_irq(adev->irq); + desc->status |= IRQ_MOVE_PCNTXT; + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ + set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); + enable_irq(adev->irq); + if (system_state == SYSTEM_BOOTING) + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } +} +#endif + +static void apbt_enable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + /* clear pending intr */ + apbt_readl(n, APBTMR_N_EOI); + ctrl &= ~APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + +static void apbt_disable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl |= APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + + 
+static int __init apbt_clockevent_register(void)
+{
+	struct sfi_timer_table_entry *mtmr;
+
+	mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+	if (mtmr == NULL) {
+		printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+		       APBT_CLOCKEVENT0_NUM);
+		return -ENODEV;
+	}
+
+	/*
+	 * We need to calculate the scaled math multiplication factor for
+	 * nanosecond to apbt tick conversion.
+	 * mult = (nsec/cycle)*2^APBT_SHIFT
+	 */
+	apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz,
+				      NSEC_PER_SEC, APBT_SHIFT);
+
+	/* Calculate the min / max delta */
+	apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
+							   &apbt_clockevent);
+	apbt_clockevent.min_delta_ns = clockevent_delta2ns(
+					APBT_MIN_DELTA_USEC*apbt_freq,
+					&apbt_clockevent);
+	/*
+	 * Start apbt with the boot cpu mask and make it
+	 * global if not used for per cpu timer.
+	 */
+	apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
+
+	if (disable_apbt_percpu) {
+		apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
+		global_clock_event = &apbt_clockevent;
+		printk(KERN_DEBUG "%s clockevent registered as global\n",
+		       global_clock_event->name);
+	}
+
+	if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
+			IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+			apbt_clockevent.name, &apbt_clockevent)) {
+		printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+		       apbt_clockevent.irq);
+	}
+
+	clockevents_register_device(&apbt_clockevent);
+	/* Start APBT 0 interrupts */
+	apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+
+	sfi_free_mtmr(mtmr);
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+/* Should be called on the CPU being set up, once per cpu */
+void apbt_setup_secondary_clock(void)
+{
+	struct apbt_dev *adev;
+	struct clock_event_device *aevt;
+	int cpu;
+
+	/* Don't register the boot CPU clockevent */
+	cpu = smp_processor_id();
+	if (cpu == boot_cpu_id)
+		return;
+	/*
+	 * We need to calculate the scaled math multiplication factor for
+	 * nanosecond to apbt tick conversion.
+	 * mult = (nsec/cycle)*2^APBT_SHIFT
+	 */
+	printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
+	adev = &per_cpu(cpu_apbt_dev, cpu);
+	aevt = &adev->evt;
+
+	memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
+	aevt->cpumask = cpumask_of(cpu);
+	aevt->name = adev->name;
+	aevt->mode = CLOCK_EVT_MODE_UNUSED;
+
+	printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
+	       cpu, aevt->name, *(u32 *)aevt->cpumask);
+
+	apbt_setup_irq(adev);
+
+	clockevents_register_device(aevt);
+
+	apbt_enable_int(cpu);
+
+	return;
+}
+
+/*
+ * this notify handler processes CPU hotplug events. in case of S0i3, nonboot
+ * cpus are disabled/enabled frequently; for performance reasons, we keep the
+ * per cpu timer irq registered so that we do not need to do
+ * free_irq/request_irq.
+ *
+ * TODO: it might be more reliable to directly disable the percpu clockevent
+ * device without the notifier chain. currently, cpu 0 may get interrupts from
+ * other cpu timers during the offline process due to the ordering of
+ * notification. the extra interrupt is harmless.
+ */
+static int apbt_cpuhp_notify(struct notifier_block *n,
+			     unsigned long action, void *hcpu)
+{
+	unsigned long cpu = (unsigned long)hcpu;
+	struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
+
+	switch (action & 0xf) {
+	case CPU_DEAD:
+		apbt_disable_int(cpu);
+		if (system_state == SYSTEM_RUNNING)
+			pr_debug("skipping APBT CPU %lu offline\n", cpu);
+		else if (adev) {
+			pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
+			free_irq(adev->irq, adev);
+		}
+		break;
+	default:
+		pr_debug("APBT notified %lu, no action\n", action);
+	}
+	return NOTIFY_OK;
+}
+
+static __init int apbt_late_init(void)
+{
+	if (disable_apbt_percpu)
+		return 0;
+	/* This notifier should be called after workqueue is ready */
+	hotcpu_notifier(apbt_cpuhp_notify, -20);
+	return 0;
+}
+fs_initcall(apbt_late_init);
+#else
+
+void apbt_setup_secondary_clock(void) {}
+
+#endif /* CONFIG_SMP */
+
+static void apbt_set_mode(enum clock_event_mode mode,
+			  struct clock_event_device *evt)
+{
+	unsigned long ctrl;
+	uint64_t delta;
+	int timer_num;
+	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+
+	timer_num = adev->num;
+	pr_debug("%s CPU %d timer %d mode=%d\n",
+		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
+		delta >>= apbt_clockevent.shift;
+		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+		ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		/*
+		 * DW APB p. 46: the timer has to be disabled before loading
+		 * the counter, otherwise it may cause a sync problem.
+		 */
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		udelay(1);
+		pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
+		apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
+		ctrl |= APBTMR_CONTROL_ENABLE;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		break;
+	/* APB timer does not have one-shot mode, use free running mode */
+	case CLOCK_EVT_MODE_ONESHOT:
+		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+		/*
+		 * set free running mode. this mode will let the timer reload
+		 * the max timeout, which gives time (3min on a 25MHz clock)
+		 * to rearm the next event, therefore emulating one-shot mode.
+		 */
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		/* write again to set free running mode */
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+
+		/*
+		 * DW APB p. 46: load the counter with all 1s before starting
+		 * free running mode.
+		 */
+		apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
+		ctrl &= ~APBTMR_CONTROL_INT;
+		ctrl |= APBTMR_CONTROL_ENABLE;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		apbt_disable_int(timer_num);
+		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		break;
+
+	case CLOCK_EVT_MODE_RESUME:
+		apbt_enable_int(timer_num);
+		break;
+	}
+}
+
+static int apbt_next_event(unsigned long delta,
+			   struct clock_event_device *evt)
+{
+	unsigned long ctrl;
+	int timer_num;
+
+	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+
+	timer_num = adev->num;
+	/* Disable timer */
+	ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+	ctrl &= ~APBTMR_CONTROL_ENABLE;
+	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+	/* write new count */
+	apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
+	ctrl |= APBTMR_CONTROL_ENABLE;
+	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+	return 0;
+}
+
+/*
+ * The APB timer clock is not in sync with pclk on Langwell, which translates
+ * to unreliable read values caused by sampling error. The error does not add
+ * up over time and only happens when sampling a 0 as a 1 by mistake, so the
+ * time would appear to go backwards. The following code tries to prevent
+ * time from traveling backwards. A little bit paranoid.
+ */
+static cycle_t apbt_read_clocksource(struct clocksource *cs)
+{
+	unsigned long t0, t1, t2;
+	static unsigned long last_read;
+
+bad_count:
+	t1 = apbt_readl(phy_cs_timer_id,
+			APBTMR_N_CURRENT_VALUE);
+	t2 = apbt_readl(phy_cs_timer_id,
+			APBTMR_N_CURRENT_VALUE);
+	if (unlikely(t1 < t2)) {
+		pr_debug("APBT: read current count error %lx:%lx:%lx\n",
+			 t1, t2, t2 - t1);
+		goto bad_count;
+	}
+	/*
+	 * check against the cached last read to make sure time does not go
+	 * back. it could be a normal rollover, but we will do a triple check
+	 * anyway.
+	 */
+	if (unlikely(t2 > last_read)) {
+		/* check if we have a normal rollover */
+		unsigned long raw_intr_status =
+			apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
+		/*
+		 * the cs timer interrupt is masked, but the raw intr bit is
+		 * set if a rollover occurs. then we read the EOI reg to
+		 * clear it.
+		 */
+		if (raw_intr_status & (1 << phy_cs_timer_id)) {
+			apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
+			goto out;
+		}
+		pr_debug("APB CS going back %lx:%lx:%lx ",
+			 t2, last_read, t2 - last_read);
+bad_count_x3:
+		pr_debug("triple check enforced\n");
+		t0 = apbt_readl(phy_cs_timer_id,
+				APBTMR_N_CURRENT_VALUE);
+		udelay(1);
+		t1 = apbt_readl(phy_cs_timer_id,
+				APBTMR_N_CURRENT_VALUE);
+		udelay(1);
+		t2 = apbt_readl(phy_cs_timer_id,
+				APBTMR_N_CURRENT_VALUE);
+		if ((t2 > t1) || (t1 > t0)) {
+			printk(KERN_ERR "Error: APB CS triple check failed\n");
+			goto bad_count_x3;
+		}
+	}
+out:
+	last_read = t2;
+	return (cycle_t)~t2;
+}
+
+static int apbt_clocksource_register(void)
+{
+	u64 start, now;
+	cycle_t t1;
+
+	/* Start the counter, use timer 2 as source, timer 0/1 for event */
+	apbt_start_counter(phy_cs_timer_id);
+
+	/* Verify whether the apbt counter works */
+	t1 = apbt_read_clocksource(&clocksource_apbt);
+	rdtscll(start);
+
+	/*
+	 * We don't know the TSC frequency yet, but waiting for
+	 * 200000 TSC cycles is safe:
+	 * 4 GHz == 50us
+	 * 1 GHz == 200us
+	 */
+	do {
+		rep_nop();
+		rdtscll(now);
+	} while ((now - start) < 200000UL);
+
+	/* APBT is the only always-on clocksource, it has to work! */
+	if (t1 == apbt_read_clocksource(&clocksource_apbt))
+		panic("APBT counter not counting. APBT disabled\n");
+
+	/*
+	 * initialize and register the APBT clocksource;
+	 * mult converts a clock cycle to ns:
+	 * mult = (ns/c) * 2^APBT_SHIFT
+	 */
+	clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
+				       (unsigned long) apbt_freq, APBT_SHIFT);
+	clocksource_register(&clocksource_apbt);
+
+	return 0;
+}
+
+/*
+ * Early setup of the APBT timer: only timer 0 is used for booting; we switch
+ * to per CPU timers later if possible. Success is recorded in
+ * apb_timer_block_enabled. Panics if setup fails, since this is the only
+ * platform timer on Moorestown.
+ */
+void __init apbt_time_init(void)
+{
+#ifdef CONFIG_SMP
+	int i;
+	struct sfi_timer_table_entry *p_mtmr;
+	unsigned int percpu_timer;
+	struct apbt_dev *adev;
+#endif
+
+	if (apb_timer_block_enabled)
+		return;
+	apbt_set_mapping();
+	if (apbt_virt_address) {
+		pr_debug("Found APBT version 0x%lx\n",
+			 apbt_readl_reg(APBTMRS_COMP_VERSION));
+	} else
+		goto out_noapbt;
+	/*
+	 * Read the frequency and check for a sane value; for the ESL model
+	 * we extend the possible clock range to allow time scaling.
+	 */
+
+	if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
+		pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+		goto out_noapbt;
+	}
+	if (apbt_clocksource_register()) {
+		pr_debug("APBT has failed to register clocksource\n");
+		goto out_noapbt;
+	}
+	if (!apbt_clockevent_register())
+		apb_timer_block_enabled = 1;
+	else {
+		pr_debug("APBT has failed to register clockevent\n");
+		goto out_noapbt;
+	}
+#ifdef CONFIG_SMP
+	/* the kernel cmdline disabled the apb timer, so we will use lapic timers */
+	if (disable_apbt_percpu) {
+		printk(KERN_INFO "apbt: disabled per cpu timer\n");
+		return;
+	}
+	pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
+	if (num_possible_cpus() <= sfi_mtimer_num) {
+		percpu_timer = 1;
+		apbt_num_timers_used = num_possible_cpus();
+	} else {
+		percpu_timer = 0;
+		apbt_num_timers_used = 1;
+		adev = &per_cpu(cpu_apbt_dev, 0);
+		adev->flags &= ~APBT_DEV_USED;
+	}
+	pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
+
+	/* here we set up the per CPU timer data structures */
+	apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
+			    GFP_KERNEL);
+	if (!apbt_devs) {
+		printk(KERN_ERR "Failed to allocate APB timer devices\n");
+		return;
+	}
+	for (i = 0; i < apbt_num_timers_used; i++) {
+		adev = &per_cpu(cpu_apbt_dev, i);
+		adev->num = i;
+		adev->cpu = i;
+		p_mtmr = sfi_get_mtmr(i);
+		if (p_mtmr) {
+			adev->tick = p_mtmr->freq_hz;
+			adev->irq = p_mtmr->irq;
+		} else
+			printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
+		adev->count = 0;
+		sprintf(adev->name, "apbt%d", i);
+	}
+#endif
+
+	return;
+
+out_noapbt:
+	apbt_clear_mapping();
+	apb_timer_block_enabled = 0;
+	panic("failed to enable APB timer\n");
+}
+
+static inline void apbt_disable(int n)
+{
+	if (is_apbt_capable()) {
+		unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+	}
+}
+
+/* called before apb_timer_enable, use early map */
+unsigned long apbt_quick_calibrate(void)
+{
+	int i, scale;
+	u64 old, new;
+	cycle_t t1, t2;
+	unsigned long khz = 0;
+	u32 loop, shift;
+
+	apbt_set_mapping();
+	apbt_start_counter(phy_cs_timer_id);
+
+	/* check if the timer can count down, otherwise return */
+	old = apbt_read_clocksource(&clocksource_apbt);
+	i = 10000;
+	while (--i) {
+		if (old != apbt_read_clocksource(&clocksource_apbt))
+			break;
+	}
+	if (!i)
+		goto failed;
+
+	/* count 16 ms */
+	loop = (apbt_freq * 1000) << 4;
+
+	/* restart the timer to ensure it won't get to 0 in the calibration */
+	apbt_start_counter(phy_cs_timer_id);
+
+	old = apbt_read_clocksource(&clocksource_apbt);
+	old += loop;
+
+	t1 = __native_read_tsc();
+
+	do {
+		new = apbt_read_clocksource(&clocksource_apbt);
+	} while (new < old);
+
+	t2 = __native_read_tsc();
+
+	shift = 5;
+	if (unlikely(loop >> shift == 0)) {
+		printk(KERN_INFO
+		       "APBT TSC calibration failed, not enough resolution\n");
+		return 0;
+	}
+	scale = (int)div_u64((t2 - t1), loop >> shift);
+	khz = (scale * apbt_freq * 1000) >> shift;
+	printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
+	return khz;
+failed:
+	return 0;
+}
-- cgit v1.2.2


From 3746c6b6e26b8ad605f11b43e54acb3481d40980 Mon Sep 17 00:00:00 2001
From: Jacob Pan
Date: Fri, 12 Feb 2010 05:01:12 -0800
Subject: x86, mrst: Platform clock setup code

Add Moorestown platform clock setup code to the x86_init abstraction.

Signed-off-by: Jacob Pan
LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D2D4@orsmsx508.amr.corp.intel.com>
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/mrst.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
(limited to 'arch/x86')

diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index b7fa049c826c..0aad8670858e 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include

 static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
 static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
@@ -166,11 +167,55 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
 	return 0;
 }

+/*
+ * the secondary clock in Moorestown can be the APBT or the LAPIC clock; it
+ * defaults to the APBT, but a cmdline option can override that.
+ */
+static void __cpuinit mrst_setup_secondary_clock(void)
+{
+	/* restore the default lapic clock if disabled by cmdline */
+	if (disable_apbt_percpu)
+		return setup_secondary_APIC_clock();
+	apbt_setup_secondary_clock();
+}
+
+static unsigned long __init mrst_calibrate_tsc(void)
+{
+	unsigned long flags, fast_calibrate;
+
+	local_irq_save(flags);
+	fast_calibrate = apbt_quick_calibrate();
+	local_irq_restore(flags);
+
+	if (fast_calibrate)
+		return fast_calibrate;
+
+	return 0;
+}
+
+void __init mrst_time_init(void)
+{
+	sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
+	pre_init_apic_IRQ0();
+	apbt_time_init();
+}
+
 void __init mrst_rtc_init(void)
 {
 	sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
 }

+/*
+ * if we use the per cpu apb timer, the boot clock is already set up. if we
+ * use the lapic timer and one apbt timer for broadcast, we need to set up
+ * the lapic boot clock.
+ */
+static void __init mrst_setup_boot_clock(void)
+{
+	pr_info("%s: per cpu apbt flag %d\n", __func__, disable_apbt_percpu);
+	if (disable_apbt_percpu)
+		setup_boot_APIC_clock();
+}
+
 /*
  * Moorestown specific x86_init function overrides and early setup
  * calls.
@@ -180,6 +225,14 @@ void __init x86_mrst_early_setup(void)
 	x86_init.resources.probe_roms = x86_init_noop;
 	x86_init.resources.reserve_resources = x86_init_noop;

+	x86_init.timers.timer_init = mrst_time_init;
+	x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
+
+	x86_init.irqs.pre_vector_init = x86_init_noop;
+
+	x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
+
+	x86_platform.calibrate_tsc = mrst_calibrate_tsc;

 	x86_init.pci.init = pci_mrst_init;
 	x86_init.pci.fixup_irqs = x86_init_noop;
-- cgit v1.2.2


From 28c6a0ba30457380b140d9d7a61530eda8969180 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 23 Feb 2010 20:27:48 -0800
Subject: x86, legacy_irq: Remove left over nr_legacy_irqs

nr_legacy_irqs and its ilk have moved to legacy_pic.

-v2: there is one in ioapic_.c

Signed-off-by: Yinghai Lu
LKML-Reference: <4B84AAC4.2020204@kernel.org>
Signed-off-by: H. Peter Anvin
---
 arch/x86/include/asm/irq.h     | 1 -
 arch/x86/kernel/apic/io_apic.c | 2 +-
 arch/x86/kernel/irqinit.c      | 7 ++-----
 3 files changed, 3 insertions(+), 7 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 262292729fc4..5458380b6ef8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -48,6 +48,5 @@ extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
 extern int vector_used_by_percpu_irq(unsigned int vector);

 extern void init_ISA_irqs(void);
-extern int nr_legacy_irqs;

 #endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8c848b5877a0..b9d08f0e1e33 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1440,7 +1440,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
 	 * controllers like 8259. Now that IO-APIC can handle this irq, update
 	 * the cfg->domain.
 	 */
-	if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
+	if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
 		apic->vector_allocation_domain(0, cfg->domain);

 	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index d2f787b3de56..ef257fc2921b 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -99,9 +99,6 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	return 0;
 }

-/* Number of legacy interrupts */
-int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
-
 void __init init_ISA_irqs(void)
 {
 	int i;
@@ -114,7 +111,7 @@ void __init init_ISA_irqs(void)
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+	for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
 		struct irq_desc *desc = irq_to_desc(i);

 		desc->status = IRQ_DISABLED;
@@ -138,7 +135,7 @@ void __init init_IRQ(void)
 	 * then this vector space can be freed and re-used dynamically as the
 	 * irq's migrate etc.
 	 */
-	for (i = 0; i < nr_legacy_irqs; i++)
+	for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
 		per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;

 	x86_init.irqs.intr_init();
-- cgit v1.2.2


From 9eeeb09edba1e3544526611663472743ca584d36 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 23 Feb 2010 18:49:04 -0800
Subject: x86, legacy_irq: Remove duplicate vector assignment

Remove the duplicated cfg[i].vector assignment.

Signed-off-by: Yinghai Lu
LKML-Reference: <4B8493A0.6080501@kernel.org>
Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b9d08f0e1e33..b758d49b811c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -161,8 +161,6 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { - if (i < legacy_pic->nr_legacy_irqs) - cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); -- cgit v1.2.2 From c54113823c777f035fa7444f8841fbccda4a5cc0 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 24 Feb 2010 09:42:50 -0800 Subject: x86, pci: Add sanity check for PCI fixed bar probing While probing for the PCI fixed BAR capability in the extended PCI configuration space we need to make sure raw_pci_ext_ops is actually initialized. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A321E8F7@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/pci/mrst.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index 6e9e1a35a5d7..8bf2fcb88d04 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -57,6 +57,10 @@ static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn) u32 pcie_cap = 0, cap_data; pos = PCIE_CAP_OFFSET; + + if (!raw_pci_ext_ops) + return 0; + while (pos) { if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, devfn, pos, 4, &pcie_cap)) -- cgit v1.2.2 From e808bae2407a087bfd40200a27587898e5a9909d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 9 Feb 2010 21:38:45 -0200 Subject: x86: Do not reserve brk for DMI if it's not going to be used This will save 64K bytes from memory when loading linux if DMI is disabled, which is good for embedded systems. Signed-off-by: Thadeu Lima de Souza Cascardo LKML-Reference: <1265758732-19320-1-git-send-email-cascardo@holoscopio.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3499b4fabc94..cb42109a55b4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -121,7 +121,9 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +#ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); +#endif unsigned int boot_cpu_id __read_mostly; -- cgit v1.2.2 From 14315592009c17035cac81f4954d5a1f4d71e489 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 17 Feb 2010 10:38:10 +0000 Subject: x86, mm: Allow highmem user page tables to be disabled at boot time Distros generally (I looked at Debian, RHEL5 and SLES11) seem to enable CONFIG_HIGHPTE for any x86 configuration which has highmem enabled. This means that the overhead applies even to machines which have a fairly modest amount of high memory and which therefore do not really benefit from allocating PTEs in high memory but still pay the price of the additional mapping operations. Running kernbench on a 4G box I found that with CONFIG_HIGHPTE=y but no actual highptes being allocated there was a reduction in system time used from 59.737s to 55.9s. 
With CONFIG_HIGHPTE=y and highmem PTEs being allocated:

Average Optimal load -j 4 Run (std deviation):
Elapsed Time 175.396 (0.238914)
User Time 515.983 (5.85019)
System Time 59.737 (1.26727)
Percent CPU 263.8 (71.6796)
Context Switches 39989.7 (4672.64)
Sleeps 42617.7 (246.307)

With CONFIG_HIGHPTE=y but with no highmem PTEs being allocated:

Average Optimal load -j 4 Run (std deviation):
Elapsed Time 174.278 (0.831968)
User Time 515.659 (6.07012)
System Time 55.9 (1.07799)
Percent CPU 263.8 (71.266)
Context Switches 39929.6 (4485.13)
Sleeps 42583.7 (373.039)

This patch allows the user to control the allocation of PTEs in highmem
from the command line ("userpte=nohigh") but retains the status quo as
the default.

It is possible that some simple heuristic could be developed which allows
auto-tuning of this option; however, I don't have a sufficiently large
machine available to me to perform any particularly meaningful
experiments.

We could probably handwave up an argument for a threshold at 16G of total
RAM. Assuming 768M of lowmem we have 196608 potential lowmem PTE pages.
Each page can map 2M of RAM in a PAE-enabled configuration, meaning a
maximum of 384G of RAM could potentially be mapped using lowmem PTEs.

Even allowing a generous factor of 10 to account for other required
lowmem allocations, generous slop to account for page sharing (which
reduces the total amount of RAM mappable by a given number of PT pages)
and other inaccuracies in the estimations, it would seem that even a 32G
machine would not have a particularly pressing need for highmem PTEs. I
think 32G could be considered to be at the upper bound of what might be
sensible on a 32 bit machine (although I think in practice 64G is still
supported).

It seems questionable whether HIGHPTE is even a win for any amount of RAM
you would sensibly run a 32 bit kernel on, rather than going 64 bit.

Signed-off-by: Ian Campbell
LKML-Reference: <1266403090-20162-1-git-send-email-ian.campbell@citrix.com>
Signed-off-by: H. Peter Anvin
---
 arch/x86/include/asm/pgalloc.h |  5 +++++
 arch/x86/mm/pgtable.c          | 31 ++++++++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 5 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 0e8c2a0fd922..271de94c3810 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -22,6 +22,11 @@ static inline void paravirt_release_pmd(unsigned long pfn) {}
 static inline void paravirt_release_pud(unsigned long pfn) {}
 #endif

+/*
+ * Flags to use when allocating a user page table page.
+ */
+extern gfp_t __userpte_alloc_gfp;
+
 /*
  * Allocate and free page tables.
*/ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ed34f5e35999..c9ba9deafe83 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -6,6 +6,14 @@ #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#ifdef CONFIG_HIGHPTE +#define PGALLOC_USER_GFP __GFP_HIGHMEM +#else +#define PGALLOC_USER_GFP 0 +#endif + +gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { return (pte_t *)__get_free_page(PGALLOC_GFP); @@ -15,16 +23,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); -#else - pte = alloc_pages(PGALLOC_GFP, 0); -#endif + pte = alloc_pages(__userpte_alloc_gfp, 0); if (pte) pgtable_page_ctor(pte); return pte; } +static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. + */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); -- cgit v1.2.2 From 0c54dd341fb701928b8e5dca91ced1870c55b05b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 25 Feb 2010 08:42:06 -0500 Subject: ftrace: Remove memory barriers from NMI code when not needed The code in stop_machine that modifies the kernel text has a bit of logic to handle the case of NMIs. stop_machine does not prevent NMIs from executing, and if an NMI were to trigger on another CPU as the modifying CPU is changing the NMI text, a GPF could result. To prevent the GPF, the NMI calls ftrace_nmi_enter() which may modify the code first, then any other NMIs will just change the text to the same content which will do no harm. The code that stop_machine called must wait for NMIs to finish while it changes each location in the kernel. That code may also change the text to what the NMI changed it to. The key is that the text will never change content while another CPU is executing it. To make the above work, the call to ftrace_nmi_enter() must also do a smp_mb() as well as atomic_inc(). But for applications like perf that require a high number of NMIs for profiling, this can have a dramatic effect on the system. Not only is it doing a full memory barrier on both nmi_enter() as well as nmi_exit() it is also modifying a global variable with an atomic operation. This kills performance on large SMP machines. Since the memory barriers are only needed when ftrace is in the process of modifying the text (which is seldom), this patch adds a "modifying_code" variable that gets set before stop machine is executed and cleared afterwards. The NMIs will check this variable and store it in a per CPU "save_modifying_code" variable that it will use to check if it needs to do the memory barriers and atomic dec on NMI exit. Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 309689245431..605ef196fdd6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -30,14 +30,32 @@ #ifdef CONFIG_DYNAMIC_FTRACE +/* + * modifying_code is set to notify NMIs that they need to use + * memory barriers when entering or exiting. 
But we don't want + * to burden NMIs with unnecessary memory barriers when code + * modification is not being done (which is most of the time). + * + * A mutex is already held when ftrace_arch_code_modify_prepare + * and post_process are called. No locks need to be taken here. + * + * Stop machine will make sure currently running NMIs are done + * and new NMIs will see the updated variable before we need + * to worry about NMIs doing memory barriers. + */ +static int modifying_code __read_mostly; +static DEFINE_PER_CPU(int, save_modifying_code); + int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { + modifying_code = 0; set_kernel_text_ro(); return 0; } @@ -149,6 +167,11 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { + __get_cpu_var(save_modifying_code) = modifying_code; + + if (!__get_cpu_var(save_modifying_code)) + return; + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { smp_rmb(); ftrace_mod_code(); @@ -160,6 +183,9 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { + if (!__get_cpu_var(save_modifying_code)) + return; + /* Finish all executions before clearing nmi_running */ smp_mb(); atomic_dec(&nmi_running); -- cgit v1.2.2 From bb8d41330ce27edb91adb6922d3f8e1a8923f727 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Feb 2010 16:42:11 +0100 Subject: x86/PCI: Prevent mmconfig memory corruption commit ff097ddd4 (x86/PCI: MMCONFIG: manage pci_mmcfg_region as a list, not a table) introduced a nasty memory corruption when pci_mmcfg_list is empty. pci_mmcfg_check_end_bus_number() dereferences pci_mmcfg_list.prev even when the list is empty. The following write hits some variable near to pci_mmcfg_list. Further down a similar problem exists, where cfg->list.next is dereferenced unconditionally and a comparison with some variable near to pci_mmcfg_list happens. Add a check for the last element into the for_each_entry() loop and remove all the other crappy logic which is just a leftover of the old array based code which was replaced by the list conversion. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Bjorn Helgaas Cc: Yinghai Lu Cc: stable@kernel.org Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index b19d1e54201e..8f3f9a50b1e0 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -303,22 +303,17 @@ static void __init pci_mmcfg_check_end_bus_number(void) { struct pci_mmcfg_region *cfg, *cfgx; - /* last one*/ - cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list); - if (cfg) - if (cfg->end_bus < cfg->start_bus) - cfg->end_bus = 255; - - if (list_is_singular(&pci_mmcfg_list)) - return; - - /* don't overlap please */ + /* Fixup overlaps */ list_for_each_entry(cfg, &pci_mmcfg_list, list) { if (cfg->end_bus < cfg->start_bus) cfg->end_bus = 255; + /* Don't access the list head ! 
*/ + if (cfg->list.next == &pci_mmcfg_list) + break; + cfgx = list_entry(cfg->list.next, typeof(*cfg), list); - if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus) + if (cfg->end_bus >= cfgx->start_bus) cfg->end_bus = cfgx->start_bus - 1; } } -- cgit v1.2.2 From d498f763950703c724c650db1d34a1c8679f9ca8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:33:49 -0500 Subject: kprobes/x86: Cleanup RELATIVEJUMP_INSTRUCTION to RELATIVEJUMP_OPCODE Change RELATIVEJUMP_INSTRUCTION macro to RELATIVEJUMP_OPCODE since it represents just the opcode byte. Signed-off-by: Masami Hiramatsu Acked-by: Mathieu Desnoyers Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133349.6725.99302.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kprobes.h | 2 +- arch/x86/kernel/kprobes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4fe681de1e76..eaec8ea7bf18 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -32,7 +32,7 @@ struct kprobe; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc -#define RELATIVEJUMP_INSTRUCTION 0xe9 +#define RELATIVEJUMP_OPCODE 0xe9 #define MAX_INSN_SIZE 16 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) \ diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5de9f4a9c3fd..15177cdfdd00 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -115,7 +115,7 @@ static void __kprobes set_jmp_op(void *from, void *to) } __attribute__((packed)) * jop; jop = (struct __arch_jmp_op *)from; jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_INSTRUCTION; + jop->op = RELATIVEJUMP_OPCODE; } /* -- cgit v1.2.2 From 0f94eb634ef7af736dee5639aac1c2fe9635d089 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:34:23 -0500 Subject: kprobes/x86: Boost probes when reentering Integrate prepare_singlestep() into setup_singlestep() to boost up reenter probes, if possible. 
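For background, "boosting" here means skipping the int3 single-step cycle
and executing the probed instruction directly from the out-of-line copy
buffer. A rough sketch of the two paths (illustrative only; the actual
logic is in setup_singlestep() in the diff below, and p->ainsn.insn is the
copied-instruction buffer):

	if (p->ainsn.boostable == 1 && !p->post_handler)
		regs->ip = (unsigned long)p->ainsn.insn;  /* run the copy directly */
	else
		regs->flags |= X86_EFLAGS_TF;             /* fall back to single-stepping */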
Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133423.6725.12071.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 48 +++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 15177cdfdd00..c69bb65006f3 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -406,18 +406,6 @@ static void __kprobes restore_btf(void) update_debugctlmsr(current->thread.debugctlmsr); } -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -430,19 +418,38 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, } static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, int reenter) { #if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return; } #endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; } /* @@ -456,11 +463,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: case KPROBE_HIT_ACTIVE: - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; + setup_singlestep(p, regs, kcb, 1); break; case KPROBE_HIT_SS: /* A probe has been hit in the codepath leading up to, or just @@ -535,13 +539,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * more here. 
 		 */
 		if (!p->pre_handler || !p->pre_handler(p, regs))
-			setup_singlestep(p, regs, kcb);
+			setup_singlestep(p, regs, kcb, 0);
 			return 1;
 		}
 	} else if (kprobe_running()) {
 		p = __get_cpu_var(current_kprobe);
 		if (p->break_handler && p->break_handler(p, regs)) {
-			setup_singlestep(p, regs, kcb);
+			setup_singlestep(p, regs, kcb, 0);
 			return 1;
 		}
 	} /* else: not a kprobe fault; let the kernel handle it */
-- cgit v1.2.2


From f007ea2685692bafb386820144cf73a14016fc7c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 25 Feb 2010 08:34:30 -0500
Subject: kprobes/x86: Cleanup save/restore registers

Introduce SAVE/RESTORE_REGS_STRING to clean up the kretprobe-trampoline
asm code. These macros will be used for emulating interruption.

Signed-off-by: Masami Hiramatsu
Cc: systemtap
Cc: DLE
Cc: Ananth N Mavinakayanahalli
Cc: Jim Keniston
Cc: Srikar Dronamraju
Cc: Christoph Hellwig
Cc: Steven Rostedt
Cc: Frederic Weisbecker
Cc: Anders Kaseorg
Cc: Tim Abbott
Cc: Andi Kleen
Cc: Jason Baron
Cc: Mathieu Desnoyers
Cc: Frederic Weisbecker
Cc: Ananth N Mavinakayanahalli
LKML-Reference: <20100225133430.6725.83342.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/kprobes.c | 128 ++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 61 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index c69bb65006f3..4ae95befd0eb 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -554,6 +554,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 	return 0;
 }

+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING \
+	/* Skip cs, ip, orig_ax. */ \
+	" subq $24, %rsp\n" \
+	" pushq %rdi\n" \
+	" pushq %rsi\n" \
+	" pushq %rdx\n" \
+	" pushq %rcx\n" \
+	" pushq %rax\n" \
+	" pushq %r8\n" \
+	" pushq %r9\n" \
+	" pushq %r10\n" \
+	" pushq %r11\n" \
+	" pushq %rbx\n" \
+	" pushq %rbp\n" \
+	" pushq %r12\n" \
+	" pushq %r13\n" \
+	" pushq %r14\n" \
+	" pushq %r15\n"
+#define RESTORE_REGS_STRING \
+	" popq %r15\n" \
+	" popq %r14\n" \
+	" popq %r13\n" \
+	" popq %r12\n" \
+	" popq %rbp\n" \
+	" popq %rbx\n" \
+	" popq %r11\n" \
+	" popq %r10\n" \
+	" popq %r9\n" \
+	" popq %r8\n" \
+	" popq %rax\n" \
+	" popq %rcx\n" \
+	" popq %rdx\n" \
+	" popq %rsi\n" \
+	" popq %rdi\n" \
+	/* Skip orig_ax, ip, cs */ \
+	" addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING \
+	/* Skip cs, ip, orig_ax and gs. */ \
+	" subl $16, %esp\n" \
+	" pushl %fs\n" \
+	" pushl %ds\n" \
+	" pushl %es\n" \
+	" pushl %eax\n" \
+	" pushl %ebp\n" \
+	" pushl %edi\n" \
+	" pushl %esi\n" \
+	" pushl %edx\n" \
+	" pushl %ecx\n" \
+	" pushl %ebx\n"
+#define RESTORE_REGS_STRING \
+	" popl %ebx\n" \
+	" popl %ecx\n" \
+	" popl %edx\n" \
+	" popl %esi\n" \
+	" popl %edi\n" \
+	" popl %ebp\n" \
+	" popl %eax\n" \
+	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here */ \
+	" addl $24, %esp\n"
+#endif
+
 /*
  * When a retprobed function returns, this code saves registers and
  * calls trampoline_handler(), which runs the kretprobe's handler.
@@ -567,65 +630,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
 	/* We don't bother saving the ss register */
 	" pushq %rsp\n"
 	" pushfq\n"
-	/*
-	 * Skip cs, ip, orig_ax.
-	 * trampoline_handler() will plug in these values
-	 */
-	" subq $24, %rsp\n"
-	" pushq %rdi\n"
-	" pushq %rsi\n"
-	" pushq %rdx\n"
-	" pushq %rcx\n"
-	" pushq %rax\n"
-	" pushq %r8\n"
-	" pushq %r9\n"
-	" pushq %r10\n"
-	" pushq %r11\n"
-	" pushq %rbx\n"
-	" pushq %rbp\n"
-	" pushq %r12\n"
-	" pushq %r13\n"
-	" pushq %r14\n"
-	" pushq %r15\n"
+	SAVE_REGS_STRING
 	" movq %rsp, %rdi\n"
 	" call trampoline_handler\n"
 	/* Replace saved sp with true return address. */
 	" movq %rax, 152(%rsp)\n"
-	" popq %r15\n"
-	" popq %r14\n"
-	" popq %r13\n"
-	" popq %r12\n"
-	" popq %rbp\n"
-	" popq %rbx\n"
-	" popq %r11\n"
-	" popq %r10\n"
-	" popq %r9\n"
-	" popq %r8\n"
-	" popq %rax\n"
-	" popq %rcx\n"
-	" popq %rdx\n"
-	" popq %rsi\n"
-	" popq %rdi\n"
-	/* Skip orig_ax, ip, cs */
-	" addq $24, %rsp\n"
+	RESTORE_REGS_STRING
 	" popfq\n"
 #else
 	" pushf\n"
-	/*
-	 * Skip cs, ip, orig_ax and gs.
-	 * trampoline_handler() will plug in these values
-	 */
-	" subl $16, %esp\n"
-	" pushl %fs\n"
-	" pushl %es\n"
-	" pushl %ds\n"
-	" pushl %eax\n"
-	" pushl %ebp\n"
-	" pushl %edi\n"
-	" pushl %esi\n"
-	" pushl %edx\n"
-	" pushl %ecx\n"
-	" pushl %ebx\n"
+	SAVE_REGS_STRING
 	" movl %esp, %eax\n"
 	" call trampoline_handler\n"
 	/* Move flags to cs */
@@ -633,15 +647,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
 	" movl %edx, 52(%esp)\n"
 	/* Replace saved flags with true return address. */
 	" movl %eax, 56(%esp)\n"
-	" popl %ebx\n"
-	" popl %ecx\n"
-	" popl %edx\n"
-	" popl %esi\n"
-	" popl %edi\n"
-	" popl %ebp\n"
-	" popl %eax\n"
-	/* Skip ds, es, fs, gs, orig_ax and ip */
-	" addl $24, %esp\n"
+	RESTORE_REGS_STRING
 	" popf\n"
 #endif
 	" ret\n");
-- cgit v1.2.2


From 3d55cc8a058ee96291d6d45b1e35121b9920eca3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 25 Feb 2010 08:34:38 -0500
Subject: x86: Add text_poke_smp for SMP cross modifying code

Add a generic text_poke_smp for SMP, which uses stop_machine() to
synchronize modifying code. This stop_machine() method is officially
described in section 7.1.3, "Handling Self- and Cross-Modifying Code",
of the Intel Software Developer's Manual 3A.

Since stop_machine() can't protect code against NMI/MCE, this function
cannot modify those handlers. Also, this function is basically for
modifying a single multibyte instruction. For modifying multiple
multibyte instructions, we need another special trap & detour mechanism.

This code originally comes from the stop_machine() version of the
immediate values work. Thanks Jason and Mathieu!

Signed-off-by: Masami Hiramatsu
Cc: systemtap
Cc: DLE
Cc: Mathieu Desnoyers
Cc: Ananth N Mavinakayanahalli
Cc: Jim Keniston
Cc: Srikar Dronamraju
Cc: Christoph Hellwig
Cc: Steven Rostedt
Cc: Frederic Weisbecker
Cc: Anders Kaseorg
Cc: Tim Abbott
Cc: Andi Kleen
Cc: Jason Baron
Cc: Frederic Weisbecker
Cc: Ananth N Mavinakayanahalli
LKML-Reference: <20100225133438.6725.80273.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/alternative.h |  4 ++-
 arch/x86/kernel/alternative.c      | 60 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)
(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index ac80b7d70014..643d6ab3588b 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -160,10 +160,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
  * invalid instruction possible) or if the instructions are changed from a
  * consistent state to another consistent state atomically.
  * More care must be taken when modifying code in the SMP case because of
- * Intel's errata.
+ * Intel's errata. text_poke_smp() takes care of that errata, but it still
+ * doesn't support modifying code in NMI/MCE handlers.
  * On the local CPU you need to be protected against NMI or MCE handlers
  * seeing an inconsistent instruction while you patch.
  */
 extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_smp(void *addr, const void *opcode, size_t len);

 #endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e63b80e5861c..c41f13c15e8f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -570,3 +571,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
 	local_irq_restore(flags);
 	return addr;
 }
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from immediate value.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+	void *addr;
+	const void *opcode;
+	size_t len;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+	struct text_poke_params *tpp = data;
+
+	if (atomic_dec_and_test(&stop_machine_first)) {
+		text_poke(tpp->addr, tpp->opcode, tpp->len);
+		smp_wmb();	/* Make sure other cpus see that this has run */
+		wrote_text = 1;
+	} else {
+		while (!wrote_text)
+			smp_rmb();
+		sync_core();
+	}
+
+	flush_icache_range((unsigned long)tpp->addr,
+			   (unsigned long)tpp->addr + tpp->len);
+	return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. This allows
+ * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
+ * should be allowed, since stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+	struct text_poke_params tpp;
+
+	tpp.addr = addr;
+	tpp.opcode = opcode;
+	tpp.len = len;
+	atomic_set(&stop_machine_first, 1);
+	wrote_text = 0;
+	stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+	return addr;
+}
+
-- cgit v1.2.2


From c0f7ac3a9edde786bc129d37627953a8b8abefdf Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 25 Feb 2010 08:34:46 -0500
Subject: kprobes/x86: Support kprobes jump optimization on x86

Introduce x86 arch-specific optimization code, which supports both
x86-32 and x86-64.

This code also supports safety checking, which decodes the whole of the
function in which the probe is inserted, and checks the following
conditions before optimization:
 - The optimized instructions which will be replaced by a jump instruction
   don't straddle the function boundary.
 - There is no indirect jump instruction, because it may jump into the
   address range that is replaced by the jump operand.
 - There is no jump/loop instruction that jumps into the address range
   replaced by the jump operand.
 - Don't optimize a kprobe if it is in a function into which fixup code
   will jump.

This uses text_poke_smp(), which doesn't support modifying code in
NMI/MCE handlers. However, since kprobes itself doesn't support NMI/MCE
code probing, it's not a problem.

Changes in v9:
 - Use *_text_reserved() for checking whether the probe can be optimized.
- Verify jump address range is in 2G range when preparing slot. - Backup original code when switching optimized buffer, instead of preparing buffer, because there can be int3 of other probes in preparing phase. - Check kprobe is disabled in arch_check_optimized_kprobe(). - Strictly check indirect jump opcodes (ff /4, ff /5). Changes in v6: - Split stop_machine-based jump patching code. - Update comments and coding style. Changes in v5: - Introduce stop_machine-based jump replacing. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133446.6725.78994.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/kprobes.h | 29 +++ arch/x86/kernel/kprobes.c | 433 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 441 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbcbfdee3ee0..e6f5a98d5157 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index eaec8ea7bf18..4ffa345a8ccb 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -33,6 +33,9 @@ struct kprobe; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc #define RELATIVEJUMP_OPCODE 0xe9 +#define RELATIVEJUMP_SIZE 5 +#define RELATIVECALL_OPCODE 0xe8 +#define RELATIVE_ADDR_SIZE 4 #define MAX_INSN_SIZE 16 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) \ @@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) +/* optinsn template addresses */ +extern kprobe_opcode_t optprobe_template_entry; +extern kprobe_opcode_t optprobe_template_val; +extern kprobe_opcode_t optprobe_template_call; +extern kprobe_opcode_t optprobe_template_end; +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTINSN_SIZE \ + (((unsigned long)&optprobe_template_end - \ + (unsigned long)&optprobe_template_entry) + \ + MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) + extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); @@ -64,6 +78,21 @@ struct arch_specific_insn { int boostable; }; +struct arch_optimized_insn { + /* copy of the original instructions */ + kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; + /* detour code buffer */ + kprobe_opcode_t *insn; + /* the size of instructions copied to detour code buffer */ + size_t size; +}; + +/* Return true (!0) if optinsn is prepared for optimization. 
*/ +static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn) +{ + return optinsn->size; +} + struct prev_kprobe { struct kprobe *kp; unsigned long status; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 4ae95befd0eb..b43bbaebe2c0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { }; const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) { - struct __arch_jmp_op { - char op; + struct __arch_relative_insn { + u8 op; s32 raddr; - } __attribute__((packed)) * jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_OPCODE; + } __attribute__((packed)) *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } /* @@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) /* * Basically, kp->ainsn.insn has an original instruction. * However, RIP-relative instruction can not do single-stepping - * at different place, fix_riprel() tweaks the displacement of + * at different place, __copy_instruction() tweaks the displacement of * that instruction. In that case, we can't recover the instruction * from the kp->ainsn.insn. * @@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) } /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. * If it does, Return the address of the 32-bit displacement word. * If not, return null. * Only applicable to 64-bit x86. */ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { -#ifdef CONFIG_X86_64 struct insn insn; - kernel_insn_init(&insn, p->ainsn.insn); + int ret; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + kernel_insn_init(&insn, src); + if (recover) { + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, + (unsigned long)src); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + } + insn_get_length(&insn); + memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; + kernel_insn_init(&insn, dest); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing @@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) p->addr + (s64) insn.displacement.value - - (u8 *) p->ainsn.insn; + newdisp = (u8 *) src + (s64) insn.displacement.value - + (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. 
*/ - disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; } #endif + return insn.length; } static void __kprobes arch_copy_kprobe(struct kprobe *p) { - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - - fix_riprel(p); + /* + * Copy an instruction without recovering int3, because it will be + * put by another subsystem. + */ + __copy_instruction(p->ainsn.insn, p->addr, 0); if (can_boost(p->addr)) p->ainsn.boostable = 0; @@ -417,9 +443,20 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } +#ifdef CONFIG_OPTPROBES +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif + static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) { + if (setup_detour_execution(p, regs, reenter)) + return; + #if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ @@ -815,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p, * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; @@ -1043,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). 
*/ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, + unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +void __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __get_cpu_var(current_kprobe) = &op->kp; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __get_cpu_var(current_kprobe) = NULL; + } + preempt_enable_no_resched(); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len, 1); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) 
== 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + int ret; + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + /* Dummy buffers for lookup_symbol_attrs */ + static char __dummy_buf[KSYM_NAME_LEN]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. 
+ */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a breakpoint (int3) with a relative jump. */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ + unsigned char jmp_code[RELATIVEJUMP_SIZE]; + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + jmp_code[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&jmp_code[1]) = rel; + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +#endif + int __init arch_init_kprobes(void) { return 0; -- cgit v1.2.2 From a92d152ef9dd89c578ca2ec7118e9de8fb74a75f Mon Sep 17 00:00:00 2001 From: "Pan, Jacob jun" Date: Wed, 24 Feb 2010 16:59:55 -0800 Subject: x86, numaq: Make CONFIG_X86_NUMAQ depend on CONFIG_PCI The NUMAQ initialization sets x86_init.pci.init to pci_numaq_init, which obviously isn't defined if CONFIG_PCI isn't defined. This dependency was implicit in the past, because pci_numaq_init was invoked from arch/x86/pci/legacy.c, which itself was conditioned on CONFIG_PCI. I suspect that no NUMA-Q machines without PCI were ever built, so instead of complicating the code by adding #ifdefs or stub functions, just disable this bit of the configuration space. [ hpa: rewrote the checkin comment ] Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A321EE1F@orsmsx508.amr.corp.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0ab2dcef7d84..f0322949328e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -425,6 +425,7 @@ config X86_32_NON_STANDARD config X86_NUMAQ bool "NUMAQ (IBM/Sequent)" depends on X86_32_NON_STANDARD + depends on PCI select NUMA select X86_MPPARSE ---help--- -- cgit v1.2.2 From 722a639fd2cec44501c04ae32af57fd822c5a2d5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Feb 2010 12:39:37 -0800 Subject: x86, pci: Exclude Moorestown PCI code if CONFIG_X86_MRST=n If we don't have any Moorestown CPU support compiled in, we don't need the Moorestown PCI support either. Signed-off-by: Yinghai Lu LKML-Reference: <4B858E89.7040807@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/pci/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 4753ebc19cae..56caf2a18baa 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -13,7 +13,9 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-y += common.o early.o mrst.o +obj-$(CONFIG_X86_MRST) += mrst.o + +obj-y += common.o early.o obj-y += amd_bus.o obj-$(CONFIG_X86_64) += bus_numa.o -- cgit v1.2.2 From c1fd1b43831fa20c91cdd461342af8edf2e87c2f Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 24 Feb 2010 17:04:47 +0200 Subject: x86, mm: Unify kernel_physical_mapping_init() API This patch changes the 32-bit version of kernel_physical_mapping_init() to return the last mapped address like the 64-bit one so that we can unify the call-site in init_memory_mapping(). Cc: Yinghai Lu Cc: KAMEZAWA Hiroyuki Signed-off-by: Pekka Enberg LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 7 ------- arch/x86/mm/init_32.c | 8 +++++--- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d406c5239019..e71c5cbc8f35 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -266,16 +266,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); -#ifdef CONFIG_X86_32 - for (i = 0; i < nr_range; i++) - kernel_physical_mapping_init(mr[i].start, mr[i].end, - mr[i].page_size_mask); - ret = end; -#else /* CONFIG_X86_64 */ for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#endif #ifdef CONFIG_X86_32 early_ioremap_page_table_range_init(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9a0c258a86be..2226f2c70ea3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask) { int use_pse = page_size_mask == (1< Date: Thu, 25 Feb 2010 10:02:14 -0800 Subject: x86, mrst: Add Kconfig dependencies for Moorestown The Moorestown platform requires IOAPIC for all interrupts from the south complex, since there is no legacy PIC. Furthermore, Moorestown I/O requires PCI. Moorestown PCI depends on PCI MMCONFIG and DIRECT method to perform device enumeration, as there is no PCI BIOS. [ hpa: rewrote commit message ] Signed-off-by: Jacob Pan LKML-Reference: <1267120934-9505-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f0322949328e..2697fdb25ac2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -388,8 +388,12 @@ config X86_ELAN config X86_MRST bool "Moorestown MID platform" + depends on PCI + depends on PCI_GOANY + depends on PCI_IOAPIC depends on X86_32 depends on X86_EXTENDED_PLATFORM + depends on X86_IO_APIC select APB_TIMER ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin -- cgit v1.2.2 From 4fb6088a5cb3a77123fea1279bf2d5b16cf27648 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2010 05:38:38 -0800 Subject: x86, pci: Add arch_init to x86_init abstraction Added an abstraction function for arch specific init calls. Signed-off-by: Jacob Pan Cc: Jesse Barnes LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318CE84@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/x86_init.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8ef56f21f9f0..b1d62bbcae3d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -98,13 +98,15 @@ struct x86_init_iommu { int (*iommu_init)(void); }; - /* - * struct x86_init_pci - platform specific pci init functions - * @init: platform specific pci init +/** + * struct x86_init_pci - platform specific pci init functions + * @arch_init: platform specific pci arch init call + * @init: platform specific pci subsystem init * @init_irq: platform specific pci irq init * @fixup_irqs: platform specific pci irq fixup */ struct x86_init_pci { + int (*arch_init)(void); int (*init)(void); void (*init_irq)(void); void (*fixup_irqs)(void); -- cgit v1.2.2 From d5d0e88c1e5b069aadb050ff6ec95df312de876a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2010 05:42:04 -0800 Subject: x86, olpc: Use pci subarch init for OLPC Replace the #ifdef'ed OLPC-specific init functions by a conditional x86_init function. If the function returns 0 we leave pci_arch_init, otherwise we continue. Signed-off-by: Thomas Gleixner Cc: Jesse Barnes Cc: Andres Salomon LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318CE89@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/olpc.h | 20 ++------------------ arch/x86/include/asm/pci_x86.h | 1 - arch/x86/kernel/olpc.c | 10 +++++++--- arch/x86/pci/init.c | 8 ++++---- arch/x86/pci/olpc.c | 3 --- 5 files changed, 13 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 3a57385d9fa7..101229b0d8ed 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -13,7 +13,6 @@ struct olpc_platform_t { #define OLPC_F_PRESENT 0x01 #define OLPC_F_DCON 0x02 -#define OLPC_F_VSA 0x04 #ifdef CONFIG_OLPC @@ -50,18 +49,6 @@ static inline int olpc_has_dcon(void) return (olpc_platform_info.flags & OLPC_F_DCON) ? 1 : 0; } -/* - * The VSA is software from AMD that typical Geode bioses will include. - * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does - * not include the VSA; instead, PCI is emulated by the kernel. - * - * The VSA is described further in arch/x86/pci/olpc.c. - */ -static inline int olpc_has_vsa(void) -{ - return (olpc_platform_info.flags & OLPC_F_VSA) ? 
1 : 0; -} - /* * The "Mass Production" version of OLPC's XO is identified as being model * C2. During the prototype phase, the following models (in chronological @@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void) return 0; } -static inline int olpc_has_vsa(void) -{ - return 0; -} - #endif +extern int pci_olpc_init(void); + /* EC related functions */ extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 6e69edfbf074..36085518badb 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -104,7 +104,6 @@ extern bool port_cf9_safe; extern int pci_direct_probe(void); extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern int pci_olpc_init(void); extern void __init dmi_check_pciprobe(void); extern void __init dmi_check_skip_isa_align(void); diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 9d1d263f786f..8297160c41b3 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,7 +17,9 @@ #include #include #include + #include +#include #include #ifdef CONFIG_OPEN_FIRMWARE @@ -243,9 +245,11 @@ static int __init olpc_init(void) olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, (unsigned char *) &olpc_platform_info.ecver, 1); - /* check to see if the VSA exists */ - if (cs5535_has_vsa2()) - olpc_platform_info.flags |= OLPC_F_VSA; +#ifdef CONFIG_PCI_OLPC + /* If the VSA exists let it emulate PCI, if not emulate in kernel */ + if (!cs5535_has_vsa2()) + x86_init.pci.arch_init = pci_olpc_init; +#endif printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 25a1f8efed4a..adb62aaa7ecd 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -1,6 +1,7 @@ #include #include #include +#include /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ @@ -15,10 +16,9 @@ static __init int pci_arch_init(void) if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); -#ifdef CONFIG_PCI_OLPC - if (!pci_olpc_init()) - return 0; /* skip additional checks if it's an XO */ -#endif + if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) + return 0; + #ifdef CONFIG_PCI_BIOS pci_pcbios_init(); #endif diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b889d824f7c6..b34815408f58 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = { int __init pci_olpc_init(void) { - if (!machine_is_olpc() || olpc_has_vsa()) - return -ENODEV; - printk(KERN_INFO "PCI: Using configuration type OLPC\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); -- cgit v1.2.2 From fb90ef93df654f2678933efbbf864adac0ae490e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Feb 2010 18:36:53 -0800 Subject: early_res: Add free_early_partial() To free partial areas in pcpu_setup... 
Reported-by: Peter Zijlstra Signed-off-by: Yinghai Lu Cc: Tejun Heo Cc: Christoph Lameter Cc: Stephen Rothwell Cc: Linus Torvalds Cc: Jesse Barnes Cc: Pekka Enberg LKML-Reference: <4B85E245.5030001@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 35abcb8b00e9..ef6370b00e70 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { +#ifdef CONFIG_NO_BOOTMEM + u64 start = __pa(ptr); + u64 end = start + size; + free_early_partial(start, end); +#else free_bootmem(__pa(ptr), size); +#endif } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) -- cgit v1.2.2 From d76a0812ac4139ceb54daab3cc70e1bd8bd9d43a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Feb 2010 17:06:01 +0200 Subject: perf_events: Add new start/stop PMU callbacks In certain situations, the kernel may need to stop and start the same event rapidly. The current PMU callbacks do not distinguish between stop and release (i.e., stop + free the resource). Thus, a counter may be released and then immediately re-acquired. Event scheduling will again take place with no guarantee that the same counter is assigned. On some processors, this may even lead to a failure to assign the event back, due to competition between cores. This patch adds a new pair of callbacks to stop and restart a counter without actually releasing the underlying counter resource. On stop, the counter is stopped and its value is saved; nothing more. On start, the value is reloaded and the counter is restarted (on x86, the actual restart is delayed until perf_enable()).
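A rough user-space sketch of the stop/start versus enable/disable distinction may help; every name below (toy_event, toy_enable, and so on) is invented for illustration and is not the kernel's struct pmu API:

#include <stdio.h>

#define NUM_COUNTERS 4

struct toy_event {
	int idx;          /* hardware counter index, -1 if none */
	long long saved;  /* counter value saved while stopped */
};

static int counter_busy[NUM_COUNTERS];
static long long hw_count[NUM_COUNTERS];

static int acquire_counter(void)
{
	for (int i = 0; i < NUM_COUNTERS; i++)
		if (!counter_busy[i]) {
			counter_busy[i] = 1;
			return i;
		}
	return -1;
}

/* disable releases the counter; a later enable may land elsewhere */
static void toy_disable(struct toy_event *e)
{
	counter_busy[e->idx] = 0;
	e->idx = -1;
}

static int toy_enable(struct toy_event *e)
{
	e->idx = acquire_counter();
	return e->idx < 0 ? -1 : 0;
}

/* stop/start keep the counter; only the count is saved and reloaded */
static void toy_stop(struct toy_event *e)  { e->saved = hw_count[e->idx]; }
static void toy_start(struct toy_event *e) { hw_count[e->idx] = e->saved; }

int main(void)
{
	struct toy_event a = { -1, 0 }, b = { -1, 0 }, c = { -1, 0 };

	toy_enable(&a);   /* a gets counter 0 */
	toy_enable(&b);   /* b gets counter 1 */

	toy_disable(&a);  /* counter 0 released...         */
	toy_enable(&c);   /* ...and immediately taken by c */
	toy_enable(&a);   /* a is reassigned to counter 2  */

	toy_stop(&b);     /* b keeps counter 1 while stopped */
	toy_start(&b);

	printf("a=%d b=%d c=%d\n", a.idx, b.idx, c.idx); /* a=2 b=1 c=0 */
	return 0;
}

The point of the extra pair shows up in main(): a disable/enable round trip can migrate an event to a different counter, while stop/start pins it to the one it already owns.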
Signed-off-by: Stephane Eranian [ added fallback to ->enable/->disable for all other PMUs fixed x86_pmu_start() to call x86_pmu.enable() merged __x86_pmu_disable into x86_pmu_stop() ] Signed-off-by: Peter Zijlstra LKML-Reference: <4b703875.0a04d00a.7896.ffffb824@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a920f173a220..9173ea95f918 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1495,7 +1495,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } -static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc); +static void x86_pmu_stop(struct perf_event *event); void hw_perf_enable(void) { @@ -1533,7 +1533,7 @@ void hw_perf_enable(void) match_prev_assignment(hwc, cpuc, i)) continue; - __x86_pmu_disable(event, cpuc); + x86_pmu_stop(event); hwc->idx = -1; } @@ -1801,6 +1801,19 @@ static int x86_pmu_enable(struct perf_event *event) return 0; } +static int x86_pmu_start(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx == -1) + return -EAGAIN; + + x86_perf_event_set_period(event, hwc, hwc->idx); + x86_pmu.enable(hwc, hwc->idx); + + return 0; +} + static void x86_pmu_unthrottle(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1924,8 +1937,9 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) event->pending_kill = POLL_IN; } -static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc) +static void x86_pmu_stop(struct perf_event *event) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; @@ -1954,7 +1968,7 @@ static void x86_pmu_disable(struct perf_event *event) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; - __x86_pmu_disable(event, cpuc); + x86_pmu_stop(event); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { @@ -2667,6 +2681,8 @@ static inline void x86_pmu_read(struct perf_event *event) static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, + .start = x86_pmu_start, + .stop = x86_pmu_stop, .read = x86_pmu_read, .unthrottle = x86_pmu_unthrottle, }; -- cgit v1.2.2 From 38331f62c20456454eed9ebea2525f072c6f1d2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Feb 2010 17:17:01 +0200 Subject: perf_events, x86: AMD event scheduling This patch adds correct AMD NorthBridge event scheduling. NB events are events measuring L3 cache, Hypertransport traffic. They are identified by an event code >= 0xe0. They measure events on the Northbride which is shared by all cores on a package. NB events are counted on a shared set of counters. When a NB event is programmed in a counter, the data actually comes from a shared counter. Thus, access to those counters needs to be synchronized. We implement the synchronization such that no two cores can be measuring NB events using the same counters. Thus, we maintain a per-NB allocation table. The available slot is propagated using the event_constraint structure. 
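A minimal sketch of the lock-free slot claim behind the per-NB allocation table, with C11 atomics standing in for the kernel's cmpxchg(); claim_slot(), release_slot() and the bare owners[] array are invented for illustration, and the real code additionally keys the table by northbridge id and handles the previous-assignment cases shown in the diff below:

#include <stdatomic.h>
#include <stdio.h>

#define NUM_COUNTERS 4

static _Atomic(void *) owners[NUM_COUNTERS];  /* NULL means the slot is free */

/* Walk the slots from 'start'; the first successful CAS owns the counter. */
static int claim_slot(void *event, int start)
{
	int i = start;

	do {
		void *expected = NULL;

		if (atomic_compare_exchange_strong(&owners[i], &expected, event))
			return i;  /* grabbed a free slot */
		if (expected == event)
			return i;  /* already ours from an earlier pass: reuse */
		if (++i == NUM_COUNTERS)
			i = 0;
	} while (i != start);

	return -1;                 /* no slot left: empty constraint */
}

static void release_slot(void *event, int i)
{
	void *expected = event;

	/* only the owner clears the slot, mirroring cmpxchg(nb->owners+i, event, NULL) */
	atomic_compare_exchange_strong(&owners[i], &expected, NULL);
}

int main(void)
{
	int e1 = 0, e2 = 0;
	int s1 = claim_slot(&e1, 0);
	int s2 = claim_slot(&e2, 0);

	printf("e1 -> slot %d, e2 -> slot %d\n", s1, s2);  /* 0 and 1 */
	release_slot(&e1, s1);
	release_slot(&e2, s2);
	return 0;
}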
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4b703957.0702d00a.6bf2.7b7d@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 265 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 262 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9173ea95f918..aa12f36e4711 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -80,6 +80,13 @@ struct event_constraint { int weight; }; +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + struct cpu_hw_events { struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; @@ -92,6 +99,7 @@ struct cpu_hw_events { int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct amd_nb *amd_nb; }; #define __EVENT_CONSTRAINT(c, n, m, w) {\ @@ -153,6 +161,8 @@ struct x86_pmu { static struct x86_pmu x86_pmu __read_mostly; +static raw_spinlock_t amd_nb_lock; + static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -802,7 +812,7 @@ static u64 amd_pmu_event_map(int hw_event) static u64 amd_pmu_raw_event(u64 hw_event) { -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL #define K7_EVNTSEL_INV_MASK 0x000800000ULL @@ -2210,6 +2220,7 @@ perf_event_nmi_handler(struct notifier_block *self, } static struct event_constraint unconstrained; +static struct event_constraint emptyconstraint; static struct event_constraint bts_constraint = EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); @@ -2249,10 +2260,146 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event return &unconstrained; } +/* + * AMD64 events are detected based on their event codes. + */ +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * only care about NB events + */ + if (!(nb && amd_is_nb_event(hwc))) + return; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_events; i++) { + if (nb->owners[i] == event) { + cmpxchg(nb->owners+i, event, NULL); + break; + } + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. 
+ * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling amd_put_event_constraints(). + * + * Non NB events are not impacted by this restriction. + */ static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - return &unconstrained; + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old = NULL; + int max = x86_pmu.num_events; + int i, j, k = -1; + + /* + * if not NB event or no NB, then no constraints + */ + if (!(nb && amd_is_nb_event(hwc))) + return &unconstrained; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for (i = 0; i < max; i++) { + /* + * keep track of first free slot + */ + if (k == -1 && !nb->owners[i]) + k = i; + + /* already present, reuse */ + if (nb->owners[i] == event) + goto done; + } + /* + * not present, so grab a new slot + * starting either at: + */ + if (hwc->idx != -1) { + /* previous assignment */ + i = hwc->idx; + } else if (k != -1) { + /* start from free slot found */ + i = k; + } else { + /* + * event not found, no slot found in + * first pass, try again from the + * beginning + */ + i = 0; + } + j = i; + do { + old = cmpxchg(nb->owners+i, NULL, event); + if (!old) + break; + if (++i == max) + i = 0; + } while (i != j); +done: + if (!old) + return &nb->event_constraints[i]; + + return &emptyconstraint; } static int x86_event_sched_in(struct perf_event *event, @@ -2465,7 +2612,8 @@ static __initconst struct x86_pmu amd_pmu = { .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints }; static __init int p6_pmu_init(void) @@ -2589,6 +2737,91 @@ static __init int intel_pmu_init(void) return 0; } +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +{ + struct amd_nb *nb; + int i; + + nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + if (!nb) + return NULL; + + memset(nb, 0, sizeof(*nb)); + nb->nb_id = nb_id; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_events; i++) { + set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static void amd_pmu_cpu_online(int cpu) +{ + struct cpu_hw_events *cpu1, *cpu2; + struct amd_nb *nb = NULL; + int i, nb_id; + + if (boot_cpu_data.x86_max_cores < 
2) + return; + + /* + * function may be called too early in the + * boot process, in which case nb_id is bogus + */ + nb_id = amd_get_nb_id(cpu); + if (nb_id == BAD_APICID) + return; + + cpu1 = &per_cpu(cpu_hw_events, cpu); + cpu1->amd_nb = NULL; + + raw_spin_lock(&amd_nb_lock); + + for_each_online_cpu(i) { + cpu2 = &per_cpu(cpu_hw_events, i); + nb = cpu2->amd_nb; + if (!nb) + continue; + if (nb->nb_id == nb_id) + goto found; + } + + nb = amd_alloc_nb(cpu, nb_id); + if (!nb) { + pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); + raw_spin_unlock(&amd_nb_lock); + return; + } +found: + nb->refcnt++; + cpu1->amd_nb = nb; + + raw_spin_unlock(&amd_nb_lock); +} + +static void amd_pmu_cpu_offline(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock(&amd_nb_lock); + + if (--cpuhw->amd_nb->refcnt == 0) + kfree(cpuhw->amd_nb); + + cpuhw->amd_nb = NULL; + + raw_spin_unlock(&amd_nb_lock); +} + static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ @@ -2601,6 +2834,11 @@ static __init int amd_pmu_init(void) memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + /* + * explicitly initialize the boot cpu, other cpus will get + * the cpu hotplug callbacks from smp_init() + */ + amd_pmu_cpu_online(smp_processor_id()); return 0; } @@ -2934,4 +3172,25 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) void hw_perf_event_setup_online(int cpu) { init_debug_store_on_cpu(cpu); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + amd_pmu_cpu_online(cpu); + break; + default: + return; + } +} + +void hw_perf_event_setup_offline(int cpu) +{ + init_debug_store_on_cpu(cpu); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + amd_pmu_cpu_offline(cpu); + break; + default: + return; + } } -- cgit v1.2.2 From 6e37738a2fac964583debe91099bc3248554f6e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Feb 2010 13:21:58 +0100 Subject: perf_events: Simplify code by removing cpu argument to hw_perf_group_sched_in() Since the cpu argument to hw_perf_group_sched_in() is always smp_processor_id(), simplify the code a little by removing this argument and using the current cpu where needed. 
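The shape of the simplification, sketched in user space under the assumption that the function only ever runs on the CPU it operates on; sched_getcpu() stands in for smp_processor_id(), and sched_in_current() and state[] are invented names:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

#define MAX_CPUS 1024

static int state[MAX_CPUS];  /* toy per-cpu state, indexed by cpu id */

/* before: int sched_in(struct event *e, int cpu), with cpu always the caller's */
static int sched_in_current(void)
{
	int cpu = sched_getcpu();  /* the CPU we are running on right now */

	if (cpu < 0 || cpu >= MAX_CPUS)
		return -1;
	return ++state[cpu];
}

int main(void)
{
	printf("events scheduled on cpu %d: %d\n",
	       sched_getcpu(), sched_in_current());
	return 0;
}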
Signed-off-by: Peter Zijlstra Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker LKML-Reference: <1265890918.5396.3.camel@laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index aa12f36e4711..ad096562d694 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2403,12 +2403,12 @@ done: } static int x86_event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { int ret = 0; event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = cpu; + event->oncpu = smp_processor_id(); event->tstamp_running += event->ctx->time - event->tstamp_stopped; if (!is_x86_event(event)) @@ -2424,7 +2424,7 @@ static int x86_event_sched_in(struct perf_event *event, } static void x86_event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -2452,9 +2452,9 @@ static void x86_event_sched_out(struct perf_event *event, */ int hw_perf_group_sched_in(struct perf_event *leader, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, int cpu) + struct perf_event_context *ctx) { - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct perf_event *sub; int assign[X86_PMC_IDX_MAX]; int n0, n1, ret; @@ -2468,14 +2468,14 @@ int hw_perf_group_sched_in(struct perf_event *leader, if (ret) return ret; - ret = x86_event_sched_in(leader, cpuctx, cpu); + ret = x86_event_sched_in(leader, cpuctx); if (ret) return ret; n1 = 1; list_for_each_entry(sub, &leader->sibling_list, group_entry) { if (sub->state > PERF_EVENT_STATE_OFF) { - ret = x86_event_sched_in(sub, cpuctx, cpu); + ret = x86_event_sched_in(sub, cpuctx); if (ret) goto undo; ++n1; @@ -2500,11 +2500,11 @@ int hw_perf_group_sched_in(struct perf_event *leader, */ return 1; undo: - x86_event_sched_out(leader, cpuctx, cpu); + x86_event_sched_out(leader, cpuctx); n0 = 1; list_for_each_entry(sub, &leader->sibling_list, group_entry) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { - x86_event_sched_out(sub, cpuctx, cpu); + x86_event_sched_out(sub, cpuctx); if (++n0 == n1) break; } -- cgit v1.2.2 From 6667661df4bc76083edf1e08831c20f64429709d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Feb 2010 16:10:48 +0100 Subject: perf_events, x86: Remove superfluous MSR writes We re-program the event control register every time we reset the count; this appears to be superfluous, hence remove it.
Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ad096562d694..dd09ccc867d3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2009,9 +2009,6 @@ static int intel_pmu_save_and_restart(struct perf_event *event) x86_perf_event_update(event, hwc, idx); ret = x86_perf_event_set_period(event, hwc, idx); - if (event->state == PERF_EVENT_STATE_ACTIVE) - intel_pmu_enable_event(hwc, idx); - return ret; } -- cgit v1.2.2 From 013cfc50672bbb638796545231683231647edb07 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 28 Jan 2010 18:05:26 +0100 Subject: oprofile/x86: remove OPROFILE_IBS config option OProfile support for IBS is now for several versions in the kernel. The feature is stable now and the code can be activated permanently. As a side effect IBS now works also on nosmp configs. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 39686c29f03a..2b9c68d868ed 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "op_x86_model.h" #include "op_counter.h" @@ -43,8 +44,6 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; -#ifdef CONFIG_OPROFILE_IBS - /* IbsFetchCtl bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) @@ -72,8 +71,6 @@ struct op_ibs_config { static struct op_ibs_config ibs_config; -#endif - #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static void op_mux_fill_in_addresses(struct op_msrs * const msrs) @@ -185,8 +182,6 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } } -#ifdef CONFIG_OPROFILE_IBS - static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) @@ -272,15 +267,6 @@ static void op_amd_stop_ibs(void) wrmsrl(MSR_AMD64_IBSOPCTL, 0); } -#else - -static inline void op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) { } -static inline void op_amd_start_ibs(void) { } -static inline void op_amd_stop_ibs(void) { } - -#endif - static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { @@ -355,8 +341,6 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifdef CONFIG_OPROFILE_IBS - static u8 ibs_eilvt_off; static inline void apic_init_ibs_nmi_per_cpu(void *arg) @@ -507,19 +491,6 @@ static void op_amd_exit(void) ibs_exit(); } -#else - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#endif /* CONFIG_OPROFILE_IBS */ - struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, -- cgit v1.2.2 From 89baaaa98a10cad5cc8516c7208b02d9fc711890 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 28 Jan 2010 16:50:45 +0100 Subject: oprofile/x86: remove node check in AMD IBS initialization Standard AMD systems have the same number of nodes as there are northbridge devices. 
However, there may kernel configurations (especially for 32 bit) or system setups exist, where the node number is different or it can not be detected properly. Thus the check is not reliable and may fail though IBS setup was fine. For this reason it is better to remove the check. Cc: stable Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 2b9c68d868ed..4eb30715b1d5 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -389,16 +389,6 @@ static int init_ibs_nmi(void) return 1; } -#ifdef CONFIG_NUMA - /* Sanity check */ - /* Works only for 64bit with proper numa implementation. */ - if (nodes != num_possible_nodes()) { - printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " - "found: %d, expected %d", - nodes, num_possible_nodes()); - return 1; - } -#endif return 0; } -- cgit v1.2.2 From 64683da6643e8c6c93f1f99548399b08c029fd13 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 4 Feb 2010 10:57:23 +0100 Subject: oprofile/x86: implement IBS cpuid feature detection This patch adds IBS feature detection using cpuid flags. An IBS capability mask is introduced to test for certain IBS features. The bit mask is the same as for IBS cpuid feature flags (Fn8000_001B_EAX), but bit 0 is used to indicate the existence of IBS. The patch also changes the handling of the IbsOpCntCtl bit (periodic op counter count control). The oprofilefs file for this feature (ibs_op/dispatched_ops) will be only exposed if the feature is available, also the default for the bit is set to count clock cycles. In general, the userland can detect the availability of a feature by checking for the corresponding file in oprofilefs. If it exists, the feature also exists. This may lead to a dynamic file layout depending on the cpu type with that the userland has to deal with. Current opcontrol is compatible. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 80 +++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 4eb30715b1d5..6557683c190e 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include "op_x86_model.h" #include "op_counter.h" @@ -58,7 +60,7 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 -static int has_ibs; /* AMD Family10h and later */ +static u32 ibs_caps; struct op_ibs_config { unsigned long op_enabled; @@ -71,6 +73,40 @@ struct op_ibs_config { static struct op_ibs_config ibs_config; +/* + * IBS cpuid feature detection + */ + +#define IBS_CPUID_FEATURES 0x8000001b + +/* + * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but + * bit 0 is used to indicate the existence of IBS. 
+ */ +#define IBS_CAPS_AVAIL (1LL<<0) +#define IBS_CAPS_OPCNT (1LL<<4) + +static u32 get_ibs_caps(void) +{ + u32 ibs_caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_AVAIL; + + ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(ibs_caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_AVAIL; + + return ibs_caps; +} + #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static void op_mux_fill_in_addresses(struct op_msrs * const msrs) @@ -189,7 +225,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, u64 val, ctl; struct op_entry entry; - if (!has_ibs) + if (!ibs_caps) return; if (ibs_config.fetch_enabled) { @@ -241,16 +277,21 @@ op_amd_handle_ibs(struct pt_regs * const regs, static inline void op_amd_start_ibs(void) { u64 val; - if (has_ibs && ibs_config.fetch_enabled) { + + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) { val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); } - if (has_ibs && ibs_config.op_enabled) { + if (ibs_config.op_enabled) { val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; - val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; + if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) + val |= IBS_OP_CNT_CTL; val |= IBS_OP_ENABLE; wrmsrl(MSR_AMD64_IBSOPCTL, val); } @@ -258,11 +299,14 @@ static inline void op_amd_start_ibs(void) static void op_amd_stop_ibs(void) { - if (has_ibs && ibs_config.fetch_enabled) + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); - if (has_ibs && ibs_config.op_enabled) + if (ibs_config.op_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSOPCTL, 0); } @@ -395,29 +439,30 @@ static int init_ibs_nmi(void) /* uninitialize the APIC for the IBS interrupts if needed */ static void clear_ibs_nmi(void) { - if (has_ibs) + if (ibs_caps) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } /* initialize the APIC for the IBS interrupts if available */ static void ibs_init(void) { - has_ibs = boot_cpu_has(X86_FEATURE_IBS); + ibs_caps = get_ibs_caps(); - if (!has_ibs) + if (!ibs_caps) return; if (init_ibs_nmi()) { - has_ibs = 0; + ibs_caps = 0; return; } - printk(KERN_INFO "oprofile: AMD IBS detected\n"); + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", + (unsigned)ibs_caps); } static void ibs_exit(void) { - if (!has_ibs) + if (!ibs_caps) return; clear_ibs_nmi(); @@ -437,7 +482,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) if (ret) return ret; - if (!has_ibs) + if (!ibs_caps) return ret; /* model specific files */ @@ -447,7 +492,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) ibs_config.fetch_enabled = 0; ibs_config.max_cnt_op = 250000; ibs_config.op_enabled = 0; - ibs_config.dispatched_ops = 1; + ibs_config.dispatched_ops = 0; dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); oprofilefs_create_ulong(sb, dir, "enable", @@ -462,8 +507,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) &ibs_config.op_enabled); oprofilefs_create_ulong(sb, dir, "max_count", &ibs_config.max_cnt_op); - oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_OPCNT) + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); return 0; } -- 
cgit v1.2.2 From f125be1469303f7b9324447f251d74a0da24952f Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 18 Jan 2010 11:25:45 -0600 Subject: oprofile/x86: implement lfsr pseudo-random number generator for IBS This patch implements a linear feedback shift register (LFSR) for pseudo-random number generation for IBS. For IBS measurements it is desirable to minimize memory traffic in the interrupt handler, since every access pollutes the data caches. Computing a maximal-period LFSR needs only shifts and ORs. The LFSR method is good enough to randomize the ops at low overhead. 16 pseudo-random bits are enough for the implementation, and it doesn't matter that the pattern repeats with a fairly short cycle. It only needs to break up (hard) periodic sampling behavior. The logic was designed by Paul Drongowski. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 6557683c190e..97c84ebe3f24 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -218,6 +218,29 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } } +/* + * 16-bit Linear Feedback Shift Register (LFSR) + * + * Feedback polynomial = x^16 + x^14 + x^13 + x^11 + 1 + */ +static unsigned int lfsr_random(void) +{ + static unsigned int lfsr_value = 0xF00D; + unsigned int bit; + + /* Compute next bit to shift in */ + bit = ((lfsr_value >> 0) ^ + (lfsr_value >> 2) ^ + (lfsr_value >> 3) ^ + (lfsr_value >> 5)) & 0x0001; + + /* Advance to next register value */ + lfsr_value = (lfsr_value >> 1) | (bit << 15); + + return lfsr_value; +} + static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) -- cgit v1.2.2 From ba52078e1917c5116c0802298d88ad0e54a6728b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 23 Feb 2010 15:46:49 +0100 Subject: oprofile/x86: implement randomization for IBS periodic op counter IBS selects an op (execution operation) for sampling by counting either cycles or dispatched ops. Better statistical samples can be produced by adding a software-generated random offset to the periodic op counter value with each sample. This patch adds software randomization to the IBS periodic op counter. The lower 12 bits of the 20-bit counter are randomized. IbsOpCurCnt is initialized with a 12-bit random value. There is a workaround if the hw cannot write to IbsOpCurCnt: then the lower 8 bits of the 16-bit IbsOpMaxCnt [15:0] value are randomized in the range of -128 to +127 by adding/subtracting an offset to the maximum count (IbsOpMaxCnt). The linear feedback shift register (LFSR) algorithm is used for pseudo-random number generation to have a low impact on the memory system.
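A small stand-alone driver, assuming the same 0xF00D seed as the patch, can check the maximal-period claim: with feedback taps 16, 14, 13 and 11, the 16-bit register walks through all 65535 non-zero states before repeating:

#include <stdio.h>

static unsigned int lfsr_value = 0xF00D;

static unsigned int lfsr_random(void)
{
	unsigned int bit;

	/* taps at bits 0, 2, 3 and 5 of the right-shifting register
	 * correspond to the polynomial terms x^16, x^14, x^13 and x^11 */
	bit = ((lfsr_value >> 0) ^
	       (lfsr_value >> 2) ^
	       (lfsr_value >> 3) ^
	       (lfsr_value >> 5)) & 0x0001;

	lfsr_value = (lfsr_value >> 1) | (bit << 15);
	return lfsr_value;
}

int main(void)
{
	unsigned long period = 0;

	do {
		lfsr_random();
		period++;
	} while (lfsr_value != 0xF00D);

	printf("LFSR period: %lu\n", period);  /* prints 65535 */
	return 0;
}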
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 69 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 97c84ebe3f24..a9d194734a8e 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -52,7 +52,7 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT_MASK 0xFFFF0000ULL -/*IbsOpCtl bits */ +/* IbsOpCtl bits */ #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) @@ -72,6 +72,7 @@ struct op_ibs_config { }; static struct op_ibs_config ibs_config; +static u64 ibs_op_ctl; /* * IBS cpuid feature detection @@ -84,8 +85,16 @@ static struct op_ibs_config ibs_config; * bit 0 is used to indicate the existence of IBS. */ #define IBS_CAPS_AVAIL (1LL<<0) +#define IBS_CAPS_RDWROPCNT (1LL<<3) #define IBS_CAPS_OPCNT (1LL<<4) +/* + * IBS randomization macros + */ +#define IBS_RANDOM_BITS 12 +#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) +#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) + static u32 get_ibs_caps(void) { u32 ibs_caps; @@ -241,6 +250,38 @@ static unsigned int lfsr_random(void) return lfsr_value; } +/* + * IBS software randomization + * + * The IBS periodic op counter is randomized in software. The lower 12 + * bits of the 20 bit counter are randomized. IbsOpCurCnt is + * initialized with a 12 bit random value. + */ +static inline u64 op_amd_randomize_ibs_op(u64 val) +{ + unsigned int random = lfsr_random(); + + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) + /* + * Work around if the hw can not write to IbsOpCurCnt + * + * Randomize the lower 8 bits of the 16 bit + * IbsOpMaxCnt [15:0] value in the range of -128 to + * +127 by adding/subtracting an offset to the + * maximum count (IbsOpMaxCnt). + * + * To avoid over or underflows and protect upper bits + * starting at bit 16, the initial value for + * IbsOpMaxCnt must fit in the range from 0x0081 to + * 0xff80. + */ + val += (s8)(random >> 4); + else + val |= (u64)(random & IBS_RANDOM_MASK) << 32; + + return val; +} + static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) @@ -290,8 +331,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; - ctl |= IBS_OP_ENABLE; + ctl = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } @@ -312,10 +352,27 @@ static inline void op_amd_start_ibs(void) } if (ibs_config.op_enabled) { - val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; + ibs_op_ctl = ibs_config.max_cnt_op >> 4; + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { + /* + * IbsOpCurCnt not supported. See + * op_amd_randomize_ibs_op() for details. + */ + ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL); + } else { + /* + * The start value is randomized with a + * positive offset, we need to compensate it + * with the half of the randomized range. Also + * avoid underflows. 
+ */ + ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, + 0xFFFFULL); + } if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) - val |= IBS_OP_CNT_CTL; - val |= IBS_OP_ENABLE; + ibs_op_ctl |= IBS_OP_CNT_CTL; + ibs_op_ctl |= IBS_OP_ENABLE; + val = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); } } -- cgit v1.2.2 From 98a2e73a0690b3610f049a64154d8145e5771713 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 23 Feb 2010 18:14:58 +0100 Subject: oprofile/x86: warn user if a counter is already active This patch generates a warning if a counter is already active. Implemented for AMD and P6 models. P4 is not supported. Cc: Naga Chumbalkar Cc: Shashi Belur Cc: Tony Jones Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 11 ++++++++++- arch/x86/oprofile/op_model_ppro.c | 11 ++++++++++- arch/x86/oprofile/op_x86_model.h | 11 +++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index a9d194734a8e..ef9d735dea35 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -194,9 +194,18 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0; i < NUM_CONTROLS; ++i) { - if (unlikely(!msrs->controls[i].addr)) + if (unlikely(!msrs->controls[i].addr)) { + if (counter_config[i].enabled && !smp_processor_id()) + /* + * counter is reserved, this is on all + * cpus, so report only for cpu #0 + */ + op_x86_warn_reserved(i); continue; + } rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 8eb05878554c..c344525ebb55 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -82,9 +82,18 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!msrs->controls[i].addr)) + if (unlikely(!msrs->controls[i].addr)) { + if (counter_config[i].enabled && !smp_processor_id()) + /* + * counter is reserved, this is on all + * cpus, so report only for cpu #0 + */ + op_x86_warn_reserved(i); continue; + } rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); } diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 7b8e75d16081..59fa2bdb0da3 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -57,6 +57,17 @@ struct op_x86_model_spec { struct op_counter_config; +static inline void op_x86_warn_in_use(int counter) +{ + pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", + counter, smp_processor_id()); +} + +static inline void op_x86_warn_reserved(int counter) +{ + pr_warning("oprofile: counter #%d is already reserved\n", counter); +} + extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); extern int op_x86_phys_to_virt(int phys); -- cgit v1.2.2 From 8588d1067147e14d1dd521fbadd1d2564f8cc794 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 23 Feb 2010 18:14:58 +0100 Subject: oprofile/x86: add comment to counter-in-use warning Currently, oprofile fails silently on platforms where a non-OS entity such as the 
system firmware "enables" and uses a performance counter. There is a warning in the code for this case. The warning indicates an already running counter. If oprofile doesn't collect data, then try using a different performance counter on your platform to monitor the desired event. Delete the counter from the desired event by editing the /usr/share/oprofile///events file. If the event cannot be monitored by any other counter, contact your hardware or BIOS vendor. Cc: Shashi Belur Cc: Tony Jones Signed-off-by: Naga Chumbalkar Signed-off-by: Robert Richter --- arch/x86/oprofile/op_x86_model.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 59fa2bdb0da3..ff82a755edd4 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -59,6 +59,15 @@ struct op_counter_config; static inline void op_x86_warn_in_use(int counter) { + /* + * The warning indicates an already running counter. If + * oprofile doesn't collect data, then try using a different + * performance counter on your platform to monitor the desired + * event. Delete counter #%d from the desired event by editing + * the /usr/share/oprofile/%s//events file. If the event + * cannot be monitored by any other counter, contact your + * hardware or BIOS vendor. + */ pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", counter, smp_processor_id()); } -- cgit v1.2.2 From 68dc819ce829f7e7977a56524e710473bdb55115 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 25 Feb 2010 19:16:46 +0100 Subject: oprofile/x86: fix perfctr nmi reservation for multiplexing Multiple virtual counters share one physical counter. The reservation of virtual counters fails due to duplicate allocation of the same counter. The counters are already reserved. Thus, the virtual counter reservation can be removed altogether. This also makes the code simpler.
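A toy model of the corrected save path; NUM_PHYS, phys_to_virt() and the arrays are invented stand-ins, and in the kernel op_x86_phys_to_virt() depends on the active multiplex round. The MSR address comes from the physical counter slot while the saved value lands in the virtual counter slot:

#include <stdio.h>

#define NUM_PHYS 4  /* physical counters */
#define NUM_VIRT 8  /* virtual counters, multiplexed in rounds */

static unsigned long long hw_counter[NUM_PHYS];  /* stands in for rdmsrl() */
static unsigned long long saved[NUM_VIRT];
static int round_robin;  /* which multiplex round is active */

static int phys_to_virt(int phys)
{
	return phys + round_robin * NUM_PHYS;
}

static void save_mpx_registers(void)
{
	for (int i = 0; i < NUM_PHYS; i++) {
		int virt = phys_to_virt(i);

		/* address from the physical slot, value into the virtual
		 * slot, as in: rdmsrl(counters[i].addr, multiplex[virt].saved) */
		saved[virt] = hw_counter[i];
	}
}

int main(void)
{
	hw_counter[0] = 123;
	hw_counter[1] = 456;
	round_robin = 1;  /* the second group of virtual counters is live */

	save_mpx_registers();
	printf("virt4=%llu virt5=%llu\n", saved[4], saved[5]);  /* 123 and 456 */
	return 0;
}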
Cc: stable@kernel.org Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 11 ++++++----- arch/x86/oprofile/op_model_amd.c | 19 ------------------- 2 files changed, 6 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3347f696edc7..7170d1e29896 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -179,7 +179,6 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) if (counter_config[i].enabled) { multiplex[i].saved = -(u64)counter_config[i].count; } else { - multiplex[i].addr = 0; multiplex[i].saved = 0; } } @@ -189,25 +188,27 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) { + struct op_msr *counters = msrs->counters; struct op_msr *multiplex = msrs->multiplex; int i; for (i = 0; i < model->num_counters; ++i) { int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - rdmsrl(multiplex[virt].addr, multiplex[virt].saved); + if (counters[i].addr) + rdmsrl(counters[i].addr, multiplex[virt].saved); } } static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) { + struct op_msr *counters = msrs->counters; struct op_msr *multiplex = msrs->multiplex; int i; for (i = 0; i < model->num_counters; ++i) { int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - wrmsrl(multiplex[virt].addr, multiplex[virt].saved); + if (counters[i].addr) + wrmsrl(counters[i].addr, multiplex[virt].saved); } } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index ef9d735dea35..2aab018a7a56 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -118,19 +118,6 @@ static u32 get_ibs_caps(void) #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -static void op_mux_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = op_x86_virt_to_phys(i); - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; - else - msrs->multiplex[i].addr = 0; - } -} - static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, struct op_msrs const * const msrs) { @@ -149,10 +136,6 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, } } -#else - -static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } - #endif /* functions for op_amd_spec */ @@ -174,8 +157,6 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) else msrs->controls[i].addr = 0; } - - op_mux_fill_in_addresses(msrs); } static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, -- cgit v1.2.2 From c17c8fbf349482e89b57d1b800e83e9f4cf40c47 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 25 Feb 2010 20:20:25 +0100 Subject: oprofile/x86: use kzalloc() instead of kmalloc() Cc: stable@kernel.org Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 6 +++--- arch/x86/oprofile/op_model_amd.c | 4 ---- arch/x86/oprofile/op_model_p4.c | 6 ------ arch/x86/oprofile/op_model_ppro.c | 6 +----- 4 files changed, 4 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 7170d1e29896..2c505ee71014 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -159,7 +159,7 @@ static int nmi_setup_mux(void) for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).multiplex = - kmalloc(multiplex_size, GFP_KERNEL); + 
kzalloc(multiplex_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).multiplex) return 0; } @@ -304,11 +304,11 @@ static int allocate_msrs(void) int i; for_each_possible_cpu(i) { - per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, + per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).counters) return 0; - per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, + per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) return 0; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 2aab018a7a56..f4ebc4596da8 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -147,15 +147,11 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) for (i = 0; i < NUM_COUNTERS; i++) { if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; } for (i = 0; i < NUM_CONTROLS; i++) { if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; } } diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index ac6b354becdf..e6a160a4684a 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -394,12 +394,6 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs) setup_num_counters(); stag = get_stagger(); - /* initialize some registers */ - for (i = 0; i < num_counters; ++i) - msrs->counters[i].addr = 0; - for (i = 0; i < num_controls; ++i) - msrs->controls[i].addr = 0; - /* the counter & cccr registers we pay attention to */ for (i = 0; i < num_counters; ++i) { addr = p4_counters[VIRT_CTR(stag, i)].counter_address; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index c344525ebb55..5d1727ba409e 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -37,15 +37,11 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) for (i = 0; i < num_counters; i++) { if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; } for (i = 0; i < num_counters; i++) { if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; } } @@ -57,7 +53,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, int i; if (!reset_value) { - reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, + reset_value = kzalloc(sizeof(reset_value[0]) * num_counters, GFP_ATOMIC); if (!reset_value) return; -- cgit v1.2.2 From cfc9c0b450176a077205ef39092f0dc1a04e020a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 26 Feb 2010 13:45:24 +0100 Subject: oprofile/x86: fix msr access to reserved counters While switching virtual counters, the perfctr MSRs are accessed. If a counter is not available, the access fails due to an invalid MSR address. This patch fixes that.
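The guard the patch introduces can be sketched in a stand-alone C model (not part of the patch): the MSR addresses, counter counts, and the virt_to_phys()/write_msr() helpers below are illustrative assumptions, standing in for the patched op_amd_setup_ctrs()/op_mux_switch_ctrl() logic and for wrmsrl().

#include <stdio.h>

#define NUM_COUNTERS      4   /* physical counters; illustrative value */
#define NUM_VIRT_COUNTERS 8   /* virtual counters multiplexed onto them */

/* a zero address means "counter not reserved"; the addresses are made up */
static unsigned long counter_addr[NUM_COUNTERS] = {
	0xc0010004, 0, 0xc0010006, 0
};
static unsigned long long reset_value[NUM_VIRT_COUNTERS];

/* models the virt->phys mapping used during multiplexing */
static int virt_to_phys(int virt)
{
	return virt % NUM_COUNTERS;
}

/* stand-in for wrmsrl(); in the kernel, writing to address 0 faults */
static void write_msr(unsigned long addr, unsigned long long val)
{
	printf("wrmsrl(%#lx, %llu)\n", addr, val);
}

int main(void)
{
	int i;

	/*
	 * Setup path after the fix: arm a virtual counter only if its
	 * backing physical counter has a valid (reserved) MSR address.
	 */
	for (i = 0; i < NUM_VIRT_COUNTERS; i++)
		reset_value[i] = counter_addr[virt_to_phys(i)] ? 100000 : 0;

	/*
	 * Switch path after the fix: a non-zero reset_value[virt] now
	 * implies a valid MSR address, so unreserved counters are
	 * never touched during the round-robin switch.
	 */
	for (i = 0; i < NUM_COUNTERS; i++) {
		int virt = i;   /* assume this round maps phys i to virt i */

		if (!reset_value[virt])
			continue;
		write_msr(counter_addr[i], reset_value[virt]);
	}
	return 0;
}

In the model, counters 1 and 3 have no address, their reset_value stays 0, and the switch loop skips them, which is exactly the invalid-address access the commit eliminates.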
Cc: stable@kernel.org Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index f4ebc4596da8..6a58256dce9f 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -127,7 +127,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) + if (!reset_value[virt]) continue; rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; @@ -163,7 +163,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* setup reset_value */ for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { - if (counter_config[i].enabled) + if (counter_config[i].enabled + && msrs->counters[op_x86_virt_to_phys(i)].addr) reset_value[i] = counter_config[i].count; else reset_value[i] = 0; @@ -197,9 +198,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) - continue; - if (!msrs->counters[i].addr) + if (!reset_value[virt]) continue; /* setup counter registers */ -- cgit v1.2.2 From f22f54f4491acd987a6c5a92de52b60ca8b58b61 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2010 12:05:05 +0100 Subject: perf_events, x86: Split PMU definitions into separate files Split amd,p6,intel into separate files so that we can easily deal with CONFIG_CPU_SUP_* things, needed to make things build now that perf_event.c relies on symbols from amd.c Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1524 +------------------------------- arch/x86/kernel/cpu/perf_event_amd.c | 416 +++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 971 ++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_p6.c | 157 ++++ 4 files changed, 1554 insertions(+), 1514 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_event_amd.c create mode 100644 arch/x86/kernel/cpu/perf_event_intel.c create mode 100644 arch/x86/kernel/cpu/perf_event_p6.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index dd09ccc867d3..641ccb9dddbc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -161,8 +161,6 @@ struct x86_pmu { static struct x86_pmu x86_pmu __read_mostly; -static raw_spinlock_t amd_nb_lock; - static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -170,140 +168,6 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { static int x86_perf_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx); -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. 
- */ -#define P6_NOP_EVENT 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_REG_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_REG_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - -static struct event_constraint intel_p6_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT_END -}; - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, -}; - -static struct event_constraint intel_core_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_core2_event_constraints[] = -{ - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_nehalem_event_constraints[] = -{ - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ - INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ - INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_westmere_event_constraints[] = -{ - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING 
*/ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_gen_event_constraints[] = -{ - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - /* * Generalized hw caching related hw_event table, filled * in on a per model basis. A value of 0 means @@ -319,515 +183,6 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static __initconst u64 westmere_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ 
C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) 
] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (INTEL_ARCH_EVTSEL_MASK | \ - INTEL_ARCH_UNIT_MASK | \ - INTEL_ARCH_EDGE_MASK | \ - INTEL_ARCH_INV_MASK | \ - INTEL_ARCH_CNT_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static __initconst u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - 
[PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_REG_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. 
@@ -1079,42 +434,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return 0; } -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - /* * Setup the hardware configuration for a given attr_type */ @@ -1223,26 +542,6 @@ static int __hw_perf_event_init(struct perf_event *event) return 0; } -static void p6_pmu_disable_all(void) -{ - u64 val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); -} - static void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1278,33 +577,6 @@ void hw_perf_disable(void) x86_pmu.disable_all(); } -static void p6_pmu_enable_all(void) -{ - unsigned long val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_enable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; - - if (WARN_ON_ONCE(!event)) - return; - - intel_pmu_enable_bts(event->hw.config); - } -} - static void x86_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1578,20 +850,6 @@ void hw_perf_enable(void) x86_pmu.enable_all(); } -static inline u64 intel_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} - static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, @@ -1603,47 +861,6 @@ static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } -static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; - - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val = P6_NOP_EVENT; - - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - 
(void)checking_wrmsrl(hwc->config_base + idx, val); -} - -static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); - return; - } - - x86_pmu_disable_event(hwc, idx); -} - static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* @@ -1702,70 +919,6 @@ x86_perf_event_set_period(struct perf_event *event, return ret; } -static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - - /* - * ANY bit is supported in v3 and up - */ - if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) - bits |= 0x4; - - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - - -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; - } - - __x86_pmu_enable_event(hwc, idx); -} - static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1887,66 +1040,6 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) -{ - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!event) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. 
*/ - event->hw.interrupts++; - event->pending_kill = POLL_IN; -} - static void x86_pmu_stop(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1966,10 +1059,6 @@ static void x86_pmu_stop(struct perf_event *event) */ x86_perf_event_update(event, hwc, idx); - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); - cpuc->events[idx] = NULL; } @@ -1996,114 +1085,6 @@ static void x86_pmu_disable(struct perf_event *event) perf_event_update_userpage(event); } -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -static int intel_pmu_save_and_restart(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; - unsigned long flags; - int idx; - - if (!x86_pmu.num_events) - return; - - local_irq_save(flags); - - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - data.raw = NULL; - - cpuc = &__get_cpu_var(cpu_hw_events); - - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } - - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } - - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - data.period = event->hw.last_period; - - if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); - } - - intel_pmu_ack_status(ack); - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; -} - static int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; @@ -2216,37 +1197,20 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, + .next = NULL, + .priority = 1 +}; + static struct event_constraint unconstrained; static struct event_constraint emptyconstraint; -static struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); - -static struct event_constraint * -intel_special_constraints(struct perf_event *event) -{ - unsigned int hw_event; - - hw_event = 
event->hw.config & INTEL_ARCH_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (event->hw.sample_period == 1))) { - - return &bts_constraint; - } - return NULL; -} - static struct event_constraint * -intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { struct event_constraint *c; - c = intel_special_constraints(event); - if (c) - return c; - if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { if ((event->hw.config & c->cmask) == c->code) @@ -2257,148 +1221,6 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event return &unconstrained; } -/* - * AMD64 events are detected based on their event codes. - */ -static inline int amd_is_nb_event(struct hw_perf_event *hwc) -{ - return (hwc->config & 0xe0) == 0xe0; -} - -static void amd_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct amd_nb *nb = cpuc->amd_nb; - int i; - - /* - * only care about NB events - */ - if (!(nb && amd_is_nb_event(hwc))) - return; - - /* - * need to scan whole list because event may not have - * been assigned during scheduling - * - * no race condition possible because event can only - * be removed on one CPU at a time AND PMU is disabled - * when we come here - */ - for (i = 0; i < x86_pmu.num_events; i++) { - if (nb->owners[i] == event) { - cmpxchg(nb->owners+i, event, NULL); - break; - } - } -} - - /* - * AMD64 NorthBridge events need special treatment because - * counter access needs to be synchronized across all cores - * of a package. Refer to BKDG section 3.12 - * - * NB events are events measuring L3 cache, Hypertransport - * traffic. They are identified by an event code >= 0xe00. - * They measure events on the NorthBride which is shared - * by all cores on a package. NB events are counted on a - * shared set of counters. When a NB event is programmed - * in a counter, the data actually comes from a shared - * counter. Thus, access to those counters needs to be - * synchronized. - * - * We implement the synchronization such that no two cores - * can be measuring NB events using the same counters. Thus, - * we maintain a per-NB allocation table. The available slot - * is propagated using the event_constraint structure. - * - * We provide only one choice for each NB event based on - * the fact that only NB events have restrictions. Consequently, - * if a counter is available, there is a guarantee the NB event - * will be assigned to it. If no slot is available, an empty - * constraint is returned and scheduling will eventually fail - * for this event. - * - * Note that all cores attached the same NB compete for the same - * counters to host NB events, this is why we use atomic ops. Some - * multi-chip CPUs may have more than one NB. - * - * Given that resources are allocated (cmpxchg), they must be - * eventually freed for others to use. This is accomplished by - * calling amd_put_event_constraints(). - * - * Non NB events are not impacted by this restriction. 
- */ -static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct amd_nb *nb = cpuc->amd_nb; - struct perf_event *old = NULL; - int max = x86_pmu.num_events; - int i, j, k = -1; - - /* - * if not NB event or no NB, then no constraints - */ - if (!(nb && amd_is_nb_event(hwc))) - return &unconstrained; - - /* - * detect if already present, if so reuse - * - * cannot merge with actual allocation - * because of possible holes - * - * event can already be present yet not assigned (in hwc->idx) - * because of successive calls to x86_schedule_events() from - * hw_perf_group_sched_in() without hw_perf_enable() - */ - for (i = 0; i < max; i++) { - /* - * keep track of first free slot - */ - if (k == -1 && !nb->owners[i]) - k = i; - - /* already present, reuse */ - if (nb->owners[i] == event) - goto done; - } - /* - * not present, so grab a new slot - * starting either at: - */ - if (hwc->idx != -1) { - /* previous assignment */ - i = hwc->idx; - } else if (k != -1) { - /* start from free slot found */ - i = k; - } else { - /* - * event not found, no slot found in - * first pass, try again from the - * beginning - */ - i = 0; - } - j = i; - do { - old = cmpxchg(nb->owners+i, NULL, event); - if (!old) - break; - if (++i == max) - i = 0; - } while (i != j); -done: - if (!old) - return &nb->event_constraints[i]; - - return &emptyconstraint; -} - static int x86_event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx) { @@ -2509,335 +1331,9 @@ undo: return ret; } -static __read_mostly struct notifier_block perf_event_nmi_notifier = { - .notifier_call = perf_event_nmi_handler, - .next = NULL, - .priority = 1 -}; - -static __initconst struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = x86_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_events = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. 
- * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .event_bits = 32, - .event_mask = (1ULL << 32) - 1, - .get_event_constraints = intel_get_event_constraints, - .event_constraints = intel_p6_event_constraints -}; - -static __initconst struct x86_pmu core_pmu = { - .name = "core", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .get_event_constraints = intel_get_event_constraints, - .event_constraints = intel_core_event_constraints, -}; - -static __initconst struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, - .get_event_constraints = intel_get_event_constraints -}; - -static __initconst struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints -}; - -static __init int p6_pmu_init(void) -{ - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - case 9: - case 13: - /* Pentium M */ - break; - default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } - - x86_pmu = p6_pmu; - - return 0; -} - -static __init int intel_pmu_init(void) -{ - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } - } - - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. 
- */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; - - version = eax.split.version_id; - if (version < 2) - x86_pmu = core_pmu; - else - x86_pmu = intel_pmu; - - x86_pmu.version = version; - x86_pmu.num_events = eax.split.num_events; - x86_pmu.event_bits = eax.split.bit_width; - x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; - - /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: - */ - if (version > 1) - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); - - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 14: /* 65 nm core solo/duo, "Yonah" */ - pr_cont("Core events, "); - break; - - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - x86_pmu.event_constraints = intel_core2_event_constraints; - pr_cont("Core2 events, "); - break; - - case 26: /* 45 nm nehalem, "Bloomfield" */ - case 30: /* 45 nm nehalem, "Lynnfield" */ - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - x86_pmu.event_constraints = intel_nehalem_event_constraints; - pr_cont("Nehalem/Corei7 events, "); - break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("Atom events, "); - break; - - case 37: /* 32 nm nehalem, "Clarkdale" */ - case 44: /* 32 nm nehalem, "Gulftown" */ - memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - x86_pmu.event_constraints = intel_westmere_event_constraints; - pr_cont("Westmere events, "); - break; - default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); - } - return 0; -} - -static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) -{ - struct amd_nb *nb; - int i; - - nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); - if (!nb) - return NULL; - - memset(nb, 0, sizeof(*nb)); - nb->nb_id = nb_id; - - /* - * initialize all possible NB constraints - */ - for (i = 0; i < x86_pmu.num_events; i++) { - set_bit(i, nb->event_constraints[i].idxmsk); - nb->event_constraints[i].weight = 1; - } - return nb; -} - -static void amd_pmu_cpu_online(int cpu) -{ - struct cpu_hw_events *cpu1, *cpu2; - struct amd_nb *nb = NULL; - int i, nb_id; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - /* - * function may be called too early in the - * boot process, in which case nb_id is bogus - */ - nb_id = amd_get_nb_id(cpu); - if (nb_id == BAD_APICID) - return; - - cpu1 = &per_cpu(cpu_hw_events, cpu); - cpu1->amd_nb = NULL; - - raw_spin_lock(&amd_nb_lock); - - for_each_online_cpu(i) { - cpu2 = &per_cpu(cpu_hw_events, i); - nb = cpu2->amd_nb; - if (!nb) - continue; - if (nb->nb_id == nb_id) - goto found; - } - - nb = amd_alloc_nb(cpu, nb_id); - if (!nb) { - pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); - raw_spin_unlock(&amd_nb_lock); - return; - } -found: - nb->refcnt++; - cpu1->amd_nb = nb; - - raw_spin_unlock(&amd_nb_lock); -} - -static void amd_pmu_cpu_offline(int cpu) -{ - struct cpu_hw_events *cpuhw; - - 
if (boot_cpu_data.x86_max_cores < 2) - return; - - cpuhw = &per_cpu(cpu_hw_events, cpu); - - raw_spin_lock(&amd_nb_lock); - - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); - - cpuhw->amd_nb = NULL; - - raw_spin_unlock(&amd_nb_lock); -} - -static __init int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; - - x86_pmu = amd_pmu; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - /* - * explicitly initialize the boot cpu, other cpus will get - * the cpu hotplug callbacks from smp_init() - */ - amd_pmu_cpu_online(smp_processor_id()); - return 0; -} +#include "perf_event_amd.c" +#include "perf_event_p6.c" +#include "perf_event_intel.c" static void __init pmu_check_apic(void) { diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c new file mode 100644 index 000000000000..6d28e08563e8 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -0,0 +1,416 @@ +#ifdef CONFIG_CPU_SUP_AMD + +static raw_spinlock_t amd_nb_lock; + +static __initconst u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. 
+ */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +static u64 amd_pmu_raw_event(u64 hw_event) +{ +#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_REG_MASK) + + return hw_event & K7_EVNTSEL_MASK; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * only care about NB events + */ + if (!(nb && amd_is_nb_event(hwc))) + return; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_events; i++) { + if (nb->owners[i] == event) { + cmpxchg(nb->owners+i, event, NULL); + break; + } + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling amd_put_event_constraints(). + * + * Non NB events are not impacted by this restriction. 
+ */ +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old = NULL; + int max = x86_pmu.num_events; + int i, j, k = -1; + + /* + * if not NB event or no NB, then no constraints + */ + if (!(nb && amd_is_nb_event(hwc))) + return &unconstrained; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for (i = 0; i < max; i++) { + /* + * keep track of first free slot + */ + if (k == -1 && !nb->owners[i]) + k = i; + + /* already present, reuse */ + if (nb->owners[i] == event) + goto done; + } + /* + * not present, so grab a new slot + * starting either at: + */ + if (hwc->idx != -1) { + /* previous assignment */ + i = hwc->idx; + } else if (k != -1) { + /* start from free slot found */ + i = k; + } else { + /* + * event not found, no slot found in + * first pass, try again from the + * beginning + */ + i = 0; + } + j = i; + do { + old = cmpxchg(nb->owners+i, NULL, event); + if (!old) + break; + if (++i == max) + i = 0; + } while (i != j); +done: + if (!old) + return &nb->event_constraints[i]; + + return &emptyconstraint; +} + +static __initconst struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints +}; + +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +{ + struct amd_nb *nb; + int i; + + nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + if (!nb) + return NULL; + + memset(nb, 0, sizeof(*nb)); + nb->nb_id = nb_id; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_events; i++) { + set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static void amd_pmu_cpu_online(int cpu) +{ + struct cpu_hw_events *cpu1, *cpu2; + struct amd_nb *nb = NULL; + int i, nb_id; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + /* + * function may be called too early in the + * boot process, in which case nb_id is bogus + */ + nb_id = amd_get_nb_id(cpu); + if (nb_id == BAD_APICID) + return; + + cpu1 = &per_cpu(cpu_hw_events, cpu); + cpu1->amd_nb = NULL; + + raw_spin_lock(&amd_nb_lock); + + for_each_online_cpu(i) { + cpu2 = &per_cpu(cpu_hw_events, i); + nb = cpu2->amd_nb; + if (!nb) + continue; + if (nb->nb_id == nb_id) + goto found; + } + + nb = amd_alloc_nb(cpu, nb_id); + if (!nb) { + pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); + raw_spin_unlock(&amd_nb_lock); + return; + } +found: + nb->refcnt++; + cpu1->amd_nb = nb; + + raw_spin_unlock(&amd_nb_lock); +} + +static void amd_pmu_cpu_offline(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + 
return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock(&amd_nb_lock); + + if (--cpuhw->amd_nb->refcnt == 0) + kfree(cpuhw->amd_nb); + + cpuhw->amd_nb = NULL; + + raw_spin_unlock(&amd_nb_lock); +} + +static __init int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + /* + * explicitly initialize the boot cpu, other cpus will get + * the cpu hotplug callbacks from smp_init() + */ + amd_pmu_cpu_online(smp_processor_id()); + return 0; +} + +#else /* CONFIG_CPU_SUP_AMD */ + +static int amd_pmu_init(void) +{ + return 0; +} + +static void amd_pmu_cpu_online(int cpu) +{ +} + +static void amd_pmu_cpu_offline(int cpu) +{ +} + +#endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c new file mode 100644 index 000000000000..cf6590cf4a5f --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -0,0 +1,971 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static struct event_constraint intel_core_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* 
INSTRUCTIONS_RETIRED */
+ FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
+ FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+ return intel_perfmon_event_map[hw_event];
+}
+
+static __initconst u64 westmere_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [
C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ 
C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 hw_event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (INTEL_ARCH_EVTSEL_MASK | \ + INTEL_ARCH_UNIT_MASK | \ + INTEL_ARCH_EDGE_MASK | \ + INTEL_ARCH_INV_MASK | \ + INTEL_ARCH_CNT_MASK) + + return hw_event & CORE_EVNTSEL_MASK; +} + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + 
debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); +} + +static void intel_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + + data.period = event->hw.last_period; + data.addr = 0; + data.raw = NULL; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. 
*/ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static inline void +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + intel_pmu_drain_bts_buffer(); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_event(hwc, idx); +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + __x86_pmu_enable_event(hwc, idx); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +static int intel_pmu_save_and_restart(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + int ret; + + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); + + return ret; +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + unsigned long flags; + int idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 ack, status; + + data.addr = 0; + data.raw = NULL; + + cpuc = &__get_cpu_var(cpu_hw_events); + + perf_disable(); + intel_pmu_drain_bts_buffer(); + status = intel_pmu_get_status(); + if (!status) { + perf_enable(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + perf_enable(); + return 1; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if 
(!intel_pmu_save_and_restart(event)) + continue; + + data.period = event->hw.last_period; + + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + + perf_enable(); + + return 1; +} + +static struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + +static struct event_constraint * +intel_special_constraints(struct perf_event *event) +{ + unsigned int hw_event; + + hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (event->hw.sample_period == 1))) { + + return &bts_constraint; + } + return NULL; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_special_constraints(event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, event); +} + +static __initconst struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_core_event_constraints, +}; + +static __initconst struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, + .get_event_constraints = intel_get_event_constraints +}; + +static __init int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. 
+ */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_core2_event_constraints; + pr_cont("Core2 events, "); + break; + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("Atom events, "); + break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + pr_cont("Westmere events, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + } + return 0; +} + +#else /* CONFIG_CPU_SUP_INTEL */ + +static int intel_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c new file mode 100644 index 000000000000..1ca5ba078afd --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -0,0 +1,157 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. 
+ */
+#define P6_NOP_EVENT 0x0000002EULL
+
+static u64 p6_pmu_raw_event(u64 hw_event)
+{
+#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define P6_EVNTSEL_INV_MASK 0x00800000ULL
+#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
+
+#define P6_EVNTSEL_MASK \
+ (P6_EVNTSEL_EVENT_MASK | \
+ P6_EVNTSEL_UNIT_MASK | \
+ P6_EVNTSEL_EDGE_MASK | \
+ P6_EVNTSEL_INV_MASK | \
+ P6_EVNTSEL_REG_MASK)
+
+ return hw_event & P6_EVNTSEL_MASK;
+}
+
+static struct event_constraint p6_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT_END
+};
+
+static void p6_pmu_disable_all(void)
+{
+ u64 val;
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void p6_pmu_enable_all(void)
+{
+ unsigned long val;
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static inline void
+p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ u64 val = P6_NOP_EVENT;
+
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + idx, val);
+}
+
+static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ u64 val;
+
+ val = hwc->config;
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + idx, val);
+}
+
+static __initconst struct x86_pmu p6_pmu = {
+ .name = "p6",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = p6_pmu_disable_all,
+ .enable_all = p6_pmu_enable_all,
+ .enable = p6_pmu_enable_event,
+ .disable = p6_pmu_disable_event,
+ .eventsel = MSR_P6_EVNTSEL0,
+ .perfctr = MSR_P6_PERFCTR0,
+ .event_map = p6_pmu_event_map,
+ .raw_event = p6_pmu_raw_event,
+ .max_events = ARRAY_SIZE(p6_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 31) - 1,
+ .version = 0,
+ .num_events = 2,
+ /*
+ * Events have 40 bits implemented. However they are designed such
+ * that bits [32-39] are sign extensions of bit 31. As such the
+ * effective width of an event for P6-like PMU is 32 bits only.
+ * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, +}; + +static __init int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ -- cgit v1.2.2 From 1dd2980d990068e20045b90c424518cc7f3657ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2010 17:07:35 +0100 Subject: perf_event, amd: Fix spinlock initialization Avoid kernels from exploding on AMD machines when they have any lock debugging bits enabled. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 6d28e08563e8..8f3dbfda3c4f 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,6 +1,6 @@ #ifdef CONFIG_CPU_SUP_AMD -static raw_spinlock_t amd_nb_lock; +static DEFINE_RAW_SPINLOCK(amd_nb_lock); static __initconst u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] -- cgit v1.2.2 From 78c06176466cbd1b3f0f67709d3023c40dbebcbd Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 26 Feb 2010 10:49:12 -0600 Subject: x86: Enable NMI on all cpus on UV Enable NMI on all cpus in UV system and add an NMI handler to dump_stack on each cpu. By default on x86 all the cpus except the boot cpu have NMI masked off. This patch enables NMI on all cpus in UV system and adds an NMI handler to dump_stack on each cpu. This way if a system hangs we can NMI the machine and get a backtrace from all the cpus. Version 2: Use x86_platform driver mechanism for nmi init, per Ingo's suggestion. Version 3: Clean up Ingo's nits. Signed-off-by: Russ Anderson LKML-Reference: <20100226164912.GA24439@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv.h | 1 + arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/x2apic_uv_x.c | 44 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 3 +++ 5 files changed, 51 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index c0a01b5d985b..3bb9491b7659 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -11,6 +11,7 @@ struct mm_struct; extern enum uv_system_type get_uv_system_type(void); extern int is_uv_system(void); extern void uv_cpu_init(void); +extern void uv_nmi_init(void); extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ea0e8ea15e15..60cc35269083 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -126,6 +126,7 @@ struct x86_cpuinit_ops { * @get_wallclock: get time from HW clock like RTC etc. 
* @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic + * @nmi_init enable NMI on cpus */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -133,6 +134,7 @@ struct x86_platform_ops { int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); + void (*nmi_init)(void); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 6ef2899eb861..4b8dbb256147 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +static DEFINE_SPINLOCK(uv_nmi_lock); static inline bool is_GRU_range(u64 start, u64 end) { @@ -74,6 +76,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (!strcmp(oem_id, "SGI")) { nodeid = early_get_nodeid(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) @@ -596,6 +599,46 @@ void __cpuinit uv_cpu_init(void) set_x2apic_extra_bits(uv_hub_info->pnode); } +/* + * When NMI is received, print a stack trace. + */ +int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) +{ + if (reason != DIE_NMI_IPI) + return NOTIFY_OK; + /* + * Use a lock so only one cpu prints at a time + * to prevent intermixed output. + */ + spin_lock(&uv_nmi_lock); + pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + dump_stack(); + spin_unlock(&uv_nmi_lock); + + return NOTIFY_STOP; +} + +static struct notifier_block uv_dump_stack_nmi_nb = { + .notifier_call = uv_handle_nmi +}; + +void uv_register_nmi_notifier(void) +{ + if (register_die_notifier(&uv_dump_stack_nmi_nb)) + printk(KERN_WARNING "UV NMI handler failed to register\n"); +} + +void uv_nmi_init(void) +{ + unsigned int value; + + /* + * Unmask NMI on all cpus + */ + value = apic_read(APIC_LVT1) | APIC_DM_NMI; + value &= ~APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} void __init uv_system_init(void) { @@ -717,6 +760,7 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); + uv_register_nmi_notifier(); proc_mkdir("sgi_uv", NULL); /* register Legacy VGA I/O redirection handler */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..838a118876c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -320,6 +320,7 @@ notrace static void __cpuinit start_secondary(void *unused) unlock_vector_lock(); ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + x86_platform.nmi_init(); /* enable local interrupts */ local_irq_enable(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ccd179dec36e..ee5746c94628 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -76,10 +76,13 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; +static void default_nmi_init(void) { }; + struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, 
.is_untracked_pat_range = is_ISA_range, + .nmi_init = default_nmi_init }; -- cgit v1.2.2 From 21c2fd9970cc8e457058f84016450a2e9876125e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Feb 2010 11:17:16 +0100 Subject: x86: apic: Fix mismerge, add arch_probe_nr_irqs() again Merge commit aef55d4922 mis-merged io_apic.c so we lost the arch_probe_nr_irqs() method. This caused subtle boot breakages (udev confusion likely due to missing drivers) with certain configs. Cc: H. Peter Anvin Cc: Yinghai Lu LKML-Reference: <20100207210250.GB8256@jenkins.home.ifup.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 979589881c80..72ac2a332993 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3876,6 +3876,28 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +#ifdef CONFIG_SPARSE_IRQ +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) + /* + * for MSI and HT dyn irq + */ + nr += nr_irqs_gsi * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + return 0; +} +#endif + static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { -- cgit v1.2.2 From 3d083407a16698de86b42aee0da2ffb280b5cb7e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 27 Feb 2010 17:24:15 +0100 Subject: x86/hw-breakpoints: Remove the name field Remove the name field from the arch_hw_breakpoint. We never deal with target symbols in the arch level, neither do we need to ever store it. It's a legacy for the previous version of the x86 breakpoint backend. Let's remove it. Signed-off-by: Frederic Weisbecker Cc: K.Prasad Cc: Linus Torvalds --- arch/x86/include/asm/hw_breakpoint.h | 1 - arch/x86/kernel/hw_breakpoint.c | 7 ------- 2 files changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 0675a7c4c20e..2a1bd8f4f23a 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -10,7 +10,6 @@ * (display/resolving) */ struct arch_hw_breakpoint { - char *name; /* Contains name of the symbol to set bkpt */ unsigned long address; u8 len; u8 type; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index dca2802c666f..41e08dff0161 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -343,13 +343,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - /* - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. 
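With the name field gone from arch_hw_breakpoint, resolving a symbol to an address is the caller's job before the perf attribute is filled in, along the lines of samples/hw_breakpoint/data_breakpoint.c (a sketch only; the symbol name and type flags here are illustrative):

	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = kallsyms_lookup_name("pid_max");	/* resolve the symbol here */
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;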
-- cgit v1.2.2 From 817a824b75b1475f1b067c8cee318c7b4d66fcde Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 26 Feb 2010 17:16:00 +0000 Subject: x86, xen: Disable highmem PTE allocation even when CONFIG_HIGHPTE=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a path in the pagefault code where the kernel deliberately breaks its own locking rules by kmapping a high pte page without holding the pagetable lock (in at least page_check_address). This breaks Xen's ability to track the pinned/unpinned state of the page. There does not appear to be a viable workaround for this behaviour so simply disable HIGHPTE for all Xen guests. Signed-off-by: Ian Campbell LKML-Reference: <1267204562-11844-1-git-send-email-ian.campbell@citrix.com> Cc: Jeremy Fitzhardinge Cc: Ingo Molnar Cc: Pasi Kärkkäinen Cc: # .32.x: 14315592: Allow highmem user page tables to be disabled at boot time Cc: # .32.x Cc: Signed-off-by: H. Peter Anvin --- arch/x86/xen/enlighten.c | 7 +++++++ arch/x86/xen/mmu.c | 11 ++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 36daccb68642..b607239c1ba8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -1094,6 +1095,12 @@ asmlinkage void __init xen_start_kernel(void) __supported_pte_mask |= _PAGE_IOMAP; + /* + * Prevent page tables from being allocated in highmem, even + * if CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + /* Work out if we support NX */ x86_configure_nx(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index bf4cd6bfe959..350a3deedf25 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1432,14 +1432,15 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) { pgprot_t prot = PAGE_KERNEL; + /* + * We disable highmem allocations for page tables so we should never + * see any calls to kmap_atomic_pte on a highmem page. + */ + BUG_ON(PageHighMem(page)); + if (PagePinned(page)) prot = PAGE_KERNEL_RO; - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - return kmap_atomic_prot(page, type, prot); } #endif -- cgit v1.2.2 From 3249b7e1df6380e9d7bb3238f64f445bf614f787 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 26 Feb 2010 17:16:01 +0000 Subject: x86, vmi: Disable highmem PTE allocation even when CONFIG_HIGHPTE=y Preventing HIGHPTE allocations under VMI will allow us to remove the kmap_atomic_pte paravirt op. Signed-off-by: Ian Campbell LKML-Reference: <1267204562-11844-2-git-send-email-ian.campbell@citrix.com> Acked-by: Alok Kataria Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmi_32.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index d430e4c30193..58aca86193e5 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -272,19 +273,11 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) void *va = kmap_atomic(page, type); /* - * Internally, the VMI ROM must map virtual addresses to physical - * addresses for processing MMU updates. 
By the time MMU updates - * are issued, this information is typically already lost. - * Fortunately, the VMI provides a cache of mapping slots for active - * page tables. - * - * We use slot zero for the linear mapping of physical memory, and - * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1. - * - * args: SLOT VA COUNT PFN + * We disable highmem allocations for page tables so we should never + * see any calls to kmap_atomic_pte on a highmem page. */ - BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); + + BUG_ON(PageHighmem(page)); return va; } @@ -640,6 +633,12 @@ static inline int __init activate_vmi(void) u64 reloc; const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + /* + * Prevent page tables from being allocated in highmem, even if + * CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + if (call_vrom_func(vmi_rom, vmi_init) != 0) { printk(KERN_ERR "VMI ROM failed to initialize!"); return 0; -- cgit v1.2.2 From dad52fc01161afcb8798c609e009aed4d104927f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 26 Feb 2010 17:16:02 +0000 Subject: x86, paravirt: Remove kmap_atomic_pte paravirt op. Now that both Xen and VMI disable allocations of PTE pages from high memory this paravirt op serves no further purpose. This effectively reverts ce6234b5 "add kmap_atomic_pte for mapping highpte pages". Signed-off-by: Ian Campbell LKML-Reference: <1267204562-11844-3-git-send-email-ian.campbell@citrix.com> Acked-by: Alok Kataria Cc: Jeremy Fitzhardinge Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/highmem.h | 4 ---- arch/x86/include/asm/paravirt.h | 9 --------- arch/x86/include/asm/paravirt_types.h | 4 ---- arch/x86/include/asm/pgtable_32.h | 4 ++-- arch/x86/kernel/paravirt.c | 4 ---- arch/x86/kernel/vmi_32.c | 20 -------------------- arch/x86/xen/mmu.c | 22 ---------------------- 7 files changed, 2 insertions(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 014c2b85ae45..a726650fc80f 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); -#ifndef CONFIG_PARAVIRT -#define kmap_atomic_pte(page, type) kmap_atomic(page, type) -#endif - #define flush_cache_kmaps() do { } while (0) extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index dd59a85a918f..5653f43d90e5 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn) PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); } -#ifdef CONFIG_HIGHPTE -static inline void *kmap_atomic_pte(struct page *page, enum km_type type) -{ - unsigned long ret; - ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); - return (void *)ret; -} -#endif - static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b1e70d51e40c..db9ef5532341 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -304,10 +304,6 @@ struct pv_mmu_ops { #endif 
/* PAGETABLE_LEVELS == 4 */ #endif /* PAGETABLE_LEVELS >= 3 */ -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - struct pv_lazy_ops lazy_mode; /* dom0 ops */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 01fd9461d323..b422d2201af3 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -54,10 +54,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); in_irq() ? KM_IRQ_PTE : \ KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ pte_index((address))) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b1739d16310..1db183ed7c01 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, -#endif - #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 58aca86193e5..7dd599deca4a 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -267,22 +267,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_HIGHPTE -static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) -{ - void *va = kmap_atomic(page, type); - - /* - * We disable highmem allocations for page tables so we should never - * see any calls to kmap_atomic_pte on a highmem page. - */ - - BUG_ON(PageHighmem(page)); - - return va; -} -#endif - static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -777,10 +761,6 @@ static inline int __init activate_vmi(void) /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); -#ifdef CONFIG_HIGHPTE - if (vmi_ops.set_linear_mapping) - pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; -#endif /* * These MUST always be patched. Don't support indirect jumps diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 350a3deedf25..f9eb7de74f42 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1427,24 +1427,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - /* - * We disable highmem allocations for page tables so we should never - * see any calls to kmap_atomic_pte on a highmem page. 
- */ - BUG_ON(PageHighMem(page)); - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - return kmap_atomic_prot(page, type, prot); -} -#endif - #ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { @@ -1903,10 +1885,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - #ifdef CONFIG_X86_64 .set_pte = xen_set_pte, #else -- cgit v1.2.2 From fad539956c9e69749a03f7817d22d1bab87657bf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 28 Feb 2010 01:06:34 -0800 Subject: x86: Fix out of order of gsi Iranna D Ankad reported that IBM x3950 systems have boot problems after this commit: | | commit b9c61b70075c87a8612624736faf4a2de5b1ed30 | | x86/pci: update pirq_enable_irq() to setup io apic routing | The problem is that with the patch, the machine freezes when console=ttyS0,... kernel serial parameter is passed. It seem to freeze at DVD initialization and the whole problem seem to be DVD/pata related, but somehow exposed through the serial parameter. Such apic problems can expose really weird behavior: ACPI: IOAPIC (id[0x10] address[0xfecff000] gsi_base[0]) IOAPIC[0]: apic_id 16, version 0, address 0xfecff000, GSI 0-2 ACPI: IOAPIC (id[0x0f] address[0xfec00000] gsi_base[3]) IOAPIC[1]: apic_id 15, version 0, address 0xfec00000, GSI 3-38 ACPI: IOAPIC (id[0x0e] address[0xfec01000] gsi_base[39]) IOAPIC[2]: apic_id 14, version 0, address 0xfec01000, GSI 39-74 ACPI: INT_SRC_OVR (bus 0 bus_irq 1 global_irq 4 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 5 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 3 global_irq 6 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 4 global_irq 7 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 6 global_irq 9 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 7 global_irq 10 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 8 global_irq 11 low edge) ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 12 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 12 global_irq 15 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 13 global_irq 16 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 14 global_irq 17 low edge) ACPI: INT_SRC_OVR (bus 0 bus_irq 15 global_irq 18 dfl dfl) It turns out that the system has three io apic controllers, but boot ioapic routing is in the second one, and that gsi_base is not 0 - it is using a bunch of INT_SRC_OVR... So these recent changes: 1. one set routing for first io apic controller 2. assume irq = gsi ... will break that system. So try to remap those gsis, need to seperate boot_ioapic_idx detection out of enable_IO_APIC() and call them early. So introduce boot_ioapic_idx, and remap_ioapic_gsi()... -v2: shift gsi with delta instead of gsi_base of boot_ioapic_idx -v3: double check with find_isa_irq_apic(0, mp_INT) to get right boot_ioapic_idx -v4: nr_legacy_irqs -v5: add print out for boot_ioapic_idx, and also make it could be applied for current kernel and previous kernel -v6: add bus_irq, in acpi_sci_ioapic_setup, so can get overwride for sci right mapping... -v7: looks like pnpacpi get irq instead of gsi, so need to revert them back... -v8: split into two patches -v9: according to Eric, use fixed 16 for shifting instead of remap -v10: still need to touch rsparser.c -v11: just revert back to way Eric suggest... anyway the ioapic in first ioapic is blocked by second... 
-v12: two patches, this one will add more loop but check apic_id and irq > 16 Reported-by: Iranna D Ankad Bisected-by: Iranna D Ankad Tested-by: Gary Hade Signed-off-by: Yinghai Lu Cc: Eric W. Biederman Cc: Thomas Renninger Cc: Eric W. Biederman Cc: Suresh Siddha Cc: len.brown@intel.com LKML-Reference: <4B8A321A.1000008@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 72ac2a332993..97e1e3ec2edf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1475,7 +1475,7 @@ static struct { static void __init setup_IO_APIC_irqs(void) { - int apic_id = 0, pin, idx, irq; + int apic_id, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; @@ -1483,14 +1483,7 @@ static void __init setup_IO_APIC_irqs(void) apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); -#ifdef CONFIG_ACPI - if (!acpi_disabled && acpi_ioapic) { - apic_id = mp_find_ioapic(0); - if (apic_id < 0) - apic_id = 0; - } -#endif - + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); if (idx == -1) { @@ -1512,6 +1505,9 @@ static void __init setup_IO_APIC_irqs(void) irq = pin_2_irq(idx, apic_id, pin); + if ((apic_id > 0) && (irq > 16)) + continue; + /* * Skip the timer IRQ if there's a quirk handler * installed and if it returns 1: @@ -4105,27 +4101,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic = 0, irq, irq_entry; + int pin, ioapic, irq, irq_entry; struct irq_desc *desc; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; -#ifdef CONFIG_ACPI - if (!acpi_disabled && acpi_ioapic) { - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - ioapic = 0; - } -#endif - + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; irq = pin_2_irq(irq_entry, ioapic, pin); + if ((ioapic > 0) && (irq > 16)) + continue; + desc = irq_to_desc(irq); /* -- cgit v1.2.2 From 1e259e0a9982078896f3404240096cbea01daca4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 28 Feb 2010 20:51:15 +0100 Subject: hw-breakpoints: Remove stub unthrottle callback We support event unthrottling in breakpoint events. It means that if we have more than sysctl_perf_event_sample_rate/HZ, perf will throttle, ignoring subsequent events until the next tick. So if ptrace exceeds this max rate, it will omit events, which breaks the ptrace determinism that is supposed to report every triggered breakpoints. This is likely to happen if we set sysctl_perf_event_sample_rate to 1. This patch removes support for unthrottling in breakpoint events to break throttling and restore ptrace determinism. 
Signed-off-by: Frederic Weisbecker Cc: 2.6.33.x Cc: Peter Zijlstra Cc: K.Prasad Cc: Paul Mackerras --- arch/x86/kernel/hw_breakpoint.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index bb6006e3e295..1e8ceadc0d6a 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -531,8 +531,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } - -void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) -{ - /* TODO */ -} -- cgit v1.2.2 From 1d6040f17d12a65b9f7ab4cb9fd6d721206b79ec Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 25 Feb 2010 19:40:46 +0100 Subject: perf, x86: make IBS macros available in perf_event.h This patch moves code from oprofile to perf_event.h to make it also available for usage by perf. Signed-off-by: Robert Richter --- arch/x86/include/asm/perf_event.h | 10 ++++++++++ arch/x86/oprofile/op_model_amd.c | 11 ----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index befd172c82ad..4933ccde96c4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -117,6 +117,16 @@ union cpuid10_edx { */ #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL + +/* IbsOpCtl bits */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) #ifdef CONFIG_PERF_EVENTS extern void init_hw_perf_events(void); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 6a58256dce9f..c67174917305 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -46,17 +46,6 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; -/* IbsFetchCtl bits/masks */ -#define IBS_FETCH_RAND_EN (1ULL<<57) -#define IBS_FETCH_VAL (1ULL<<49) -#define IBS_FETCH_ENABLE (1ULL<<48) -#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL - -/* IbsOpCtl bits */ -#define IBS_OP_CNT_CTL (1ULL<<19) -#define IBS_OP_VAL (1ULL<<18) -#define IBS_OP_ENABLE (1ULL<<17) - #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 -- cgit v1.2.2 From a163b1099dc7016704043c7fc572ae42519f08f7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 25 Feb 2010 19:43:07 +0100 Subject: perf, x86: add some IBS macros to perf_event.h Signed-off-by: Robert Richter --- arch/x86/include/asm/perf_event.h | 4 +++- arch/x86/oprofile/op_model_amd.c | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 4933ccde96c4..c7f60e1297ab 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -121,12 +121,14 @@ union cpuid10_edx { #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) -#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL +#define IBS_FETCH_CNT 0xFFFF0000ULL +#define IBS_FETCH_MAX_CNT 0x0000FFFFULL /* IbsOpCtl bits */ #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_MAX_CNT 0x0000FFFFULL #ifdef CONFIG_PERF_EVENTS extern void init_hw_perf_events(void); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 
c67174917305..8ddb9fa9c1b2 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -279,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); ctl |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } @@ -319,7 +319,7 @@ static inline void op_amd_start_ibs(void) return; if (ibs_config.fetch_enabled) { - val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); @@ -341,7 +341,7 @@ static inline void op_amd_start_ibs(void) * avoid underflows. */ ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, - 0xFFFFULL); + IBS_OP_MAX_CNT); } if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) ibs_op_ctl |= IBS_OP_CNT_CTL; -- cgit v1.2.2 From 339d3261aa3eb0e12f68ef868e042c1ca03628f7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 6 Feb 2010 09:42:39 +0100 Subject: x86/amd-iommu: Remove double NULL check in check_device dev was tested just above, so drop the second test. Signed-off-by: Julia Lawall Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index adb0ba025702..2c4a5012038e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -118,7 +118,7 @@ static bool check_device(struct device *dev) return false; /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) + if (dev->bus != &pci_bus_type) return false; devid = get_device_id(dev); -- cgit v1.2.2 From 5d214fe6e808a8caa9cb6f610c0190d3f50ac570 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Feb 2010 14:44:49 +0100 Subject: x86/amd-iommu: Protect IOMMU-API map/unmap path This patch introduces a mutex to lock page table updates in the IOMMU-API path. We can't use the spin_lock here because this path might sleep.
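The rule being worked around is the standard one: a path that can sleep must not run under a spinlock. A minimal sketch of the pattern (illustrative; map_one() stands in for iommu_map_page(), which may allocate page-table pages and hence sleep):

#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/errno.h>

static DEFINE_MUTEX(api_lock);

/* stand-in for iommu_map_page(): may allocate and therefore sleep */
static int map_one(unsigned long iova, unsigned long paddr)
{
        unsigned long pt = __get_free_page(GFP_KERNEL); /* may sleep */

        if (!pt)
                return -ENOMEM;
        /* ... install the iova -> paddr mapping ... */
        return 0;
}

static int map_range(unsigned long iova, unsigned long paddr, int npages)
{
        int i, ret = 0;

        /* a spin_lock here would be a bug: map_one() can sleep */
        mutex_lock(&api_lock);
        for (i = 0; i < npages; i++) {
                ret = map_one(iova, paddr);
                if (ret)
                        break;
                iova += PAGE_SIZE;
                paddr += PAGE_SIZE;
        }
        mutex_unlock(&api_lock);
        return ret;
}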
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 ++ arch/x86/kernel/amd_iommu.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index ba19ad4c47d0..5e46e78f3b1b 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -21,6 +21,7 @@ #define _ASM_X86_AMD_IOMMU_TYPES_H #include +#include #include #include @@ -237,6 +238,7 @@ struct protection_domain { struct list_head list; /* for list of all protection domains */ struct list_head dev_list; /* List of all devices in this domain */ spinlock_t lock; /* mostly used to lock the page table*/ + struct mutex api_lock; /* protect page tables in the iommu-api path */ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ u64 *pt_root; /* page table root pointer */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 2c4a5012038e..b97f2f1c449a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2327,6 +2327,7 @@ static struct protection_domain *protection_domain_alloc(void) return NULL; spin_lock_init(&domain->lock); + mutex_init(&domain->api_lock); domain->id = domain_id_alloc(); if (!domain->id) goto out_err; @@ -2456,6 +2457,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom, iova &= PAGE_MASK; paddr &= PAGE_MASK; + mutex_lock(&domain->api_lock); + for (i = 0; i < npages; ++i) { ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); if (ret) @@ -2465,6 +2468,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom, paddr += PAGE_SIZE; } + mutex_unlock(&domain->api_lock); + return 0; } @@ -2477,12 +2482,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova &= PAGE_MASK; + mutex_lock(&domain->api_lock); + for (i = 0; i < npages; ++i) { iommu_unmap_page(domain, iova, PM_MAP_4k); iova += PAGE_SIZE; } iommu_flush_tlb_pde(domain); + + mutex_unlock(&domain->api_lock); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, -- cgit v1.2.2 From 04e856c072b84042bb56c487c2868638bb3f78db Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 17 Feb 2010 08:51:20 -0800 Subject: x86/amd-iommu: Pt mode fix for domain_destroy After a guest is shutdown, assigned devices are not properly returned to the pt domain. This can leave the device using stale cached IOMMU data, and result in a non-functional device after it's re-bound to the host driver. For example, I see this upon rebinding: AMD-Vi: Event logged [IO_PAGE_FAULT device=02:00.0 domain=0x0000 address=0x000000007e2a8000 flags=0x0050] AMD-Vi: Event logged [IO_PAGE_FAULT device=02:00.0 domain=0x0000 address=0x000000007e2a8040 flags=0x0050] AMD-Vi: Event logged [IO_PAGE_FAULT device=02:00.0 domain=0x0000 address=0x000000007e2a8080 flags=0x0050] AMD-Vi: Event logged [IO_PAGE_FAULT device=02:00.0 domain=0x0000 address=0x000000007e2a80c0 flags=0x0050] 0000:02:00.0: eth2: Detected Hardware Unit Hang: ... The amd_iommu_destroy_domain() function calls do_detach() which doesn't reattach the pt domain to the device. Use __detach_device() instead. 
Cc: stable@kernel.org Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b97f2f1c449a..0c0425436a73 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2298,7 +2298,7 @@ static void cleanup_domain(struct protection_domain *domain) list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { struct device *dev = dev_data->dev; - do_detach(dev); + __detach_device(dev); atomic_set(&dev_data->bind, 0); } -- cgit v1.2.2 From 3551a708f35fc712af43aeb7f541512c5cfc4936 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Mar 2010 13:52:19 +0100 Subject: x86/amd-iommu: Report errors in acpi parsing functions upstream Since acpi_table_parse ignores the return values of the parsing function this patch introduces a workaround and reports these errors upstream via a global variable. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9dc91b431470..feaf47184900 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -138,9 +138,9 @@ int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; /* - * Set to true if ACPI table parsing and hardware intialization went properly + * The ACPI table parsing functions set this variable on an error */ -static bool amd_iommu_initialized; +static int __initdata amd_iommu_init_err; /* * List of protection domains - used during resume @@ -391,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) */ for (i = 0; i < table->length; ++i) checksum += p[i]; - if (checksum != 0) + if (checksum != 0) { /* ACPI table corrupt */ - return -ENODEV; + amd_iommu_init_err = -ENODEV; + return 0; + } p += IVRS_HEADER_LENGTH; @@ -920,11 +922,16 @@ static int __init init_iommu_all(struct acpi_table_header *table) h->mmio_phys); iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); - if (iommu == NULL) - return -ENOMEM; + if (iommu == NULL) { + amd_iommu_init_err = -ENOMEM; + return 0; + } + ret = init_iommu_one(iommu, h); - if (ret) - return ret; + if (ret) { + amd_iommu_init_err = ret; + return 0; + } break; default: break; @@ -934,8 +941,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) } WARN_ON(p != end); - amd_iommu_initialized = true; - return 0; } @@ -1211,6 +1216,10 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; + ret = amd_iommu_init_err; + if (ret) + goto out; + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); @@ -1270,12 +1279,19 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; - if (!amd_iommu_initialized) + if (amd_iommu_init_err) { + ret = amd_iommu_init_err; goto free; + } if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; + if (amd_iommu_init_err) { + ret = amd_iommu_init_err; + goto free; + } + ret = sysdev_class_register(&amd_iommu_sysdev_class); if (ret) goto free; -- cgit v1.2.2 From bb1165d6882f423f90fc7007a88c6c993b7c2ac4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 1 Mar 2010 
14:21:23 +0100 Subject: perf, x86: rename macro in ARCH_PERFMON_EVENTSEL_ENABLE For consistency reasons this patch renames ARCH_PERFMON_EVENTSEL0_ENABLE to ARCH_PERFMON_EVENTSEL_ENABLE. The following is performed: $ sed -i -e s/ARCH_PERFMON_EVENTSEL0_ENABLE/ARCH_PERFMON_EVENTSEL_ENABLE/g \ arch/x86/include/asm/perf_event.h arch/x86/kernel/cpu/perf_event.c \ arch/x86/kernel/cpu/perf_event_p6.c \ arch/x86/kernel/cpu/perfctr-watchdog.c \ arch/x86/oprofile/op_model_amd.c arch/x86/oprofile/op_model_ppro.c Signed-off-by: Robert Richter --- arch/x86/include/asm/perf_event.h | 2 +- arch/x86/kernel/cpu/perf_event.c | 8 ++++---- arch/x86/kernel/cpu/perf_event_p6.c | 8 ++++---- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- arch/x86/oprofile/op_model_amd.c | 6 +++--- arch/x86/oprofile/op_model_ppro.c | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index c7f60e1297ab..80e693684f18 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -18,7 +18,7 @@ #define MSR_ARCH_PERFMON_EVENTSEL0 0x186 #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22) #define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) #define ARCH_PERFMON_EVENTSEL_INT (1 << 20) #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9dddbc..6531b4bdb22d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -553,9 +553,9 @@ static void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(x86_pmu.eventsel + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -590,7 +590,7 @@ static void x86_pmu_enable_all(void) continue; val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -853,7 +853,7 @@ void hw_perf_enable(void) static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); + hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); } static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 1ca5ba078afd..a4e67b99d91c 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -62,7 +62,7 @@ static void p6_pmu_disable_all(void) /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(MSR_P6_EVNTSEL0, val); } @@ -72,7 +72,7 @@ static void p6_pmu_enable_all(void) /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(MSR_P6_EVNTSEL0, val); } @@ -83,7 +83,7 @@ p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) u64 val = P6_NOP_EVENT; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)checking_wrmsrl(hwc->config_base + idx, val); } @@ -95,7 +95,7 @@ static void p6_pmu_enable_event(struct hw_perf_event *hwc, int 
idx) val = hwc->config; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)checking_wrmsrl(hwc->config_base + idx, val); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 74f4e85a5727..fb329e9f8494 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -680,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) cpu_nmi_set_wd_enabled(); apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8ddb9fa9c1b2..090cbbec7dbd 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -171,7 +171,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, continue; } rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); @@ -398,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs) if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } @@ -418,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 5d1727ba409e..2bf90fafa7b5 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -88,7 +88,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, continue; } rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); @@ -166,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } @@ -184,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs) if (!reset_value[i]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } -- cgit v1.2.2 From 3fd28fce765632d0fe46b31f63c0e7a7ec6c6b79 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Thu, 19 Nov 2009 17:54:07 +0200 Subject: KVM: x86: make double/triple fault promotion generic to all exceptions Move Double-Fault generation logic out of page fault exception generating function to cover more generic case. 
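Condensed, the promotion rules implemented by kvm_multiple_exception() in the diff below (following SDM Table 5-5; "replace" means the new exception overwrites the pending one, in the hope that instruction re-execution regenerates the lost one):

/*
 *  pending \ new:   benign     contributory   page fault
 *  none             queue      queue          queue
 *  benign           replace    replace        replace
 *  contributory     replace    #DF            replace
 *  page fault       replace    #DF            #DF
 *  #DF              any new exception -> triple fault (VM shutdown)
 */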
Signed-off-by: Eddie Dong Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 89 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1e1bc9d412d..8d860e0301a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -257,12 +257,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code) +{ + u32 prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_multiple_exception(vcpu, nr, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); @@ -270,25 +326,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; - - if (vcpu->arch.exception.pending) { - switch(vcpu->arch.exception.nr) { - case DF_VECTOR: - /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - return; - case PF_VECTOR: - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - default: - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - vcpu->arch.exception.pending = false; - break; - } - } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -301,11 +338,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + kvm_multiple_exception(vcpu, nr, true, error_code); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -- cgit v1.2.2 From cb84b55f6cde26c7c17beaf87da08645ae6ccbf9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 11 Nov 
2009 17:29:49 -0200 Subject: KVM: x86: raise TSS exception for NULL CS and SS segments Windows 2003 uses a task switch to triple fault and reboot (the other exception being reserved pdptrs bits). Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d860e0301a0..279318677911 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4410,6 +4410,15 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } +static void kvm_check_segment_descriptor(struct kvm_vcpu *vcpu, int seg, + u16 selector) +{ + /* NULL selector is not valid for CS and SS */ + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) + if (!selector) + kvm_queue_exception_e(vcpu, TS_VECTOR, selector >> 3); +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { @@ -4419,6 +4428,8 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; + + kvm_check_segment_descriptor(vcpu, seg, selector); kvm_seg.type |= type_bits; if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && -- cgit v1.2.2 From 186a3e526ac1b4063a723f90ae4893beedb24fc6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 Dec 2009 15:17:00 +0200 Subject: KVM: MMU: Report spte not found in rmap before BUG() In the past we've had single-bit errors in the other two cases; the printk() may confirm it for the third case (many->many). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89a49fb46a27..4f499d7f7106 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -662,6 +662,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } + pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); BUG(); } } -- cgit v1.2.2 From 59708670b639bff00f92e519df1ae14da240e919 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 15 Dec 2009 13:29:54 +0800 Subject: KVM: VMX: Trap invalid MWAIT/MONITOR instruction We don't support these instructions, but a guest can execute them even if the 'monitor' feature hasn't been exposed in CPUID. So we trap them and inject a #UD if the guest tries this.
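Condensed from the diff below, the mechanism has three parts: vmx.h grows EXIT_REASON_MONITOR_INSTRUCTION (39), setup_vmcs_config() adds CPU_BASED_MWAIT_EXITING and CPU_BASED_MONITOR_EXITING to the required execution controls, and both exit reasons are routed to one tiny handler:

static int handle_invalid_op(struct kvm_vcpu *vcpu)
{
        kvm_queue_exception(vcpu, UD_VECTOR);   /* #UD, as real hardware
                                                   without 'monitor' would */
        return 1;       /* exit fully handled; guest execution may resume */
}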
Cc: stable@kernel.org Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b4945419a84..8f6b0111446a 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -251,6 +251,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4918d6fc924..8a8e13965076 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1224,6 +1224,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | @@ -3416,6 +3418,12 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -3453,6 +3461,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, }; static const int kvm_vmx_max_exit_handlers = -- cgit v1.2.2 From cdc0e24456bf5678f63497569c3676c9019f82c1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Dec 2009 17:21:14 +0200 Subject: KVM: VMX: Move some cr[04] related constants to vmx.c They have no place in common code. 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 ------------- arch/x86/kvm/vmx.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f865e8b8540..da6dee862763 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,19 +38,6 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8a8e13965076..efbb614ccd36 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,19 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_GUEST_CR4_MASK \ + (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive -- cgit v1.2.2 From fc78f51938e1ea866daa2045851b2e5681371668 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Dec 2009 12:16:48 +0200 Subject: KVM: Add accessor for reading cr4 (or some bits of cr4) Some bits of cr4 can be owned by the guest on vmx, so when we read them, we copy them to the vcpu structure. In preparation for making the set of guest-owned bits dynamic, use helpers to access these bits so we don't need to know where the bit resides. No changes to svm since all bits are host-owned there. 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/kvm_cache_regs.h | 12 ++++++++++++ arch/x86/kvm/mmu.h | 5 +++-- arch/x86/kvm/vmx.c | 13 ++++++++----- arch/x86/kvm/x86.c | 16 ++++++---------- 5 files changed, 30 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da6dee862763..e9f4f12ec3c4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -272,6 +272,7 @@ struct kvm_vcpu_arch { unsigned long cr2; unsigned long cr3; unsigned long cr4; + unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6a4403..35acc36e1782 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -38,4 +38,16 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + if (mask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b3884b49..4567d8042b22 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -64,12 +65,12 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) static inline int is_pae(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PAE; + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); } static inline int is_pse(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PSE; + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); } static inline int is_paging(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index efbb614ccd36..284e905c59d3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1615,8 +1615,10 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1661,7 +1663,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1669,7 +1671,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) @@ -2420,6 +2422,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = ~KVM_GUEST_CR4_MASK; tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -3050,7 +3053,7 @@ static 
int handle_dr(struct kvm_vcpu *vcpu) vcpu->arch.eff_db[dr] = val; break; case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) kvm_queue_exception(vcpu, UD_VECTOR); break; case 6: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 279318677911..84dd33e717fd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -482,7 +482,7 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { @@ -1899,7 +1899,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !(vcpu->arch.cr4 & X86_CR4_MCE)) { + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); @@ -3616,7 +3616,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { unsigned long value; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: value = vcpu->arch.cr0; @@ -3628,7 +3627,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) value = vcpu->arch.cr3; break; case 4: - value = vcpu->arch.cr4; + value = kvm_read_cr4(vcpu); break; case 8: value = kvm_get_cr8(vcpu); @@ -3656,7 +3655,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: kvm_set_cr8(vcpu, val & 0xfUL); @@ -4237,11 +4236,10 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); sregs->cr0 = vcpu->arch.cr0; sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); @@ -4737,13 +4735,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.cr3); -- cgit v1.2.2 From 4c38609ac569483152f9cb7e5a66f17355e563b2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Dec 2009 12:26:18 +0200 Subject: KVM: VMX: Make guest cr4 mask more conservative Instead of specifying the bits which we want to trap on, specify the bits which we allow the guest to change transparently. This is safer wrt future changes to cr4. 
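The invariant set up in the diff below is worth spelling out: the VMCS guest/host mask is simply the complement of the guest-owned bits, so any CR4 bit not explicitly listed in KVM_CR4_GUEST_OWNED_BITS traps to the host by default:

/* from vmx_vcpu_setup() in the diff below */
vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);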
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 284e905c59d3..755811a564b3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -69,8 +69,10 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT) + #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -2421,8 +2423,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - vmx->vcpu.arch.cr4_guest_owned_bits = ~KVM_GUEST_CR4_MASK; + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); -- cgit v1.2.2 From ce03e4f21a33b97c5d20ce597f64c361bb247904 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Dec 2009 12:29:14 +0200 Subject: KVM: VMX: When using ept, allow the guest to own cr4.pge We make no use of cr4.pge if ept is enabled, but the guest does (to flush global mappings, as with vmap()), so give the guest ownership of this bit. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 755811a564b3..a7ebaa5f2b65 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2424,6 +2424,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; -- cgit v1.2.2 From bc23008b610dda1b0e69cd473b31c1391d6873f2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 8 Dec 2009 12:14:42 +0200 Subject: KVM: VMX: Fold ept_update_paging_mode_cr4() into its caller ept_update_paging_mode_cr4() accesses vcpu->arch.cr4 directly, which usually needs to be accessed via kvm_read_cr4(). In this case, we can't, since cr4 is in the process of being updated. Instead of adding inane comments, fold the function into its caller (vmx_set_cr4), so it can use the not-yet-committed cr4 directly. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a7ebaa5f2b65..694baed9bc8f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1680,16 +1680,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, *hw_cr0 &= ~X86_CR0_WP; } -static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, - struct kvm_vcpu *vcpu) -{ - if (!is_paging(vcpu)) { - *hw_cr4 &= ~X86_CR4_PAE; - *hw_cr4 |= X86_CR4_PSE; - } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) - *hw_cr4 &= ~X86_CR4_PAE; -} - static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1767,8 +1757,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (enable_ept) - ept_update_paging_mode_cr4(&hw_cr4, vcpu); + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); -- cgit v1.2.2 From 8a7e3f01e692cd202fb7c042cf2be9ff8c599a1e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:42 +0800 Subject: KVM: VMX: Remove redundant variable It's no longer necessary. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 694baed9bc8f..f8f2fdc26894 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2399,14 +2399,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; - u64 data; int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - data = data_low | ((u64)data_high << 32); vmx->guest_msrs[j].index = i; vmx->guest_msrs[j].data = 0; vmx->guest_msrs[j].mask = -1ull; -- cgit v1.2.2 From 2bf78fa7b9b0d2917fd6587eadb3c0f6bbaf1718 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:44 +0800 Subject: KVM: Extend shared_msr_global to per CPU shared_msr_global saved the host values of the relevant MSRs, but it assumed that all MSRs it tracked shared the same value across different CPUs. That's not true for some MSRs, e.g. MSR_TSC_AUX. Extend it to be per-CPU to support MSR_TSC_AUX and similar MSRs. Note that shared_msr_global still makes one assumption: it can only deal with MSRs whose host values won't change after the KVM module is loaded.
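The failure mode is easiest to see with MSR_TSC_AUX, which the host initializes to the CPU number (see the vsyscall change that follows): a single global host value cannot be correct for more than one CPU. A sketch of the corrected per-CPU bookkeeping (field names follow the diff below):

/* one instance per CPU (DEFINE_PER_CPU in the real code) */
struct kvm_shared_msr_values {
        u64 host;       /* this CPU's host value, restored on user return */
        u64 curr;       /* value currently loaded in the MSR on this CPU */
};

/* e.g. MSR_TSC_AUX: host == 2 on CPU 2 but host == 5 on CPU 5; the old
 * global kvm_shared_msr.value could only record one of them. */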
Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84dd33e717fd..4e7bbc49b7e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -93,16 +93,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); struct kvm_shared_msrs_global { int nr; - struct kvm_shared_msr { - u32 msr; - u64 value; - } msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_NR_SHARED_MSRS]; }; struct kvm_shared_msrs { struct user_return_notifier urn; bool registered; - u64 current_value[KVM_NR_SHARED_MSRS]; + struct kvm_shared_msr_values { + u64 host; + u64 curr; + } values[KVM_NR_SHARED_MSRS]; }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; @@ -147,53 +147,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msr *global; struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); + struct kvm_shared_msr_values *values; for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - global = &shared_msrs_global.msrs[slot]; - if (global->value != locals->current_value[slot]) { - wrmsrl(global->msr, global->value); - locals->current_value[slot] = global->value; + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; } } locals->registered = false; user_return_notifier_unregister(urn); } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static void shared_msr_update(unsigned slot, u32 msr) { - int cpu; + struct kvm_shared_msrs *smsr; u64 value; + smsr = &__get_cpu_var(shared_msrs); + /* only read, and nobody should modify it at this time, + * so don't need lock */ + if (slot >= shared_msrs_global.nr) { + printk(KERN_ERR "kvm: invalid MSR slot!"); + return; + } + rdmsrl_safe(msr, &value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot].msr = msr; - rdmsrl_safe(msr, &value); - shared_msrs_global.msrs[slot].value = value; - for_each_online_cpu(cpu) - per_cpu(shared_msrs, cpu).current_value[slot] = value; + shared_msrs_global.msrs[slot] = msr; + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { unsigned i; - struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); for (i = 0; i < shared_msrs_global.nr; ++i) - locals->current_value[i] = shared_msrs_global.msrs[i].value; + shared_msr_update(i, shared_msrs_global.msrs[i]); } void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - if (((value ^ smsr->current_value[slot]) & mask) == 0) + if (((value ^ smsr->values[slot].curr) & mask) == 0) return; - smsr->current_value[slot] = value; - wrmsrl(shared_msrs_global.msrs[slot].msr, value); + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); -- cgit v1.2.2 From be43f83dada2cf0e9e01c9a0ba42977c5bd70f9d Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 
Dec 2009 16:48:45 +0800 Subject: x86: Raise vsyscall priority on hotplug notifier chain KVM needs vsyscall_init() to initialize MSR_TSC_AUX before it reads the value. Per Avi's suggestion, this patch raises the vsyscall priority on the hotplug notifier chain to 30. CC: Ingo Molnar CC: linux-kernel@vger.kernel.org Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kernel/vsyscall_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e5872ff0..1c0c6ab9c60f 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,8 @@ static int __init vsyscall_init(void) register_sysctl_table(kernel_root_table2); #endif on_each_cpu(cpu_vsyscall_init, NULL, 1); - hotcpu_notifier(cpu_vsyscall_notifier, 0); + /* notifier priority > KVM */ + hotcpu_notifier(cpu_vsyscall_notifier, 30); return 0; } -- cgit v1.2.2 From 0e85188049afacdfce9c026144142264981bbabb Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:46 +0800 Subject: KVM: Add cpuid_update() callback to kvm_x86_ops Sometimes we need to adjust some state in order to reflect the guest's CPUID settings, e.g. if we don't expose rdtscp to the guest, we don't want to enable it in hardware. cpuid_update() is introduced for this purpose. Also export kvm_find_cpuid_entry() for later use. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 3 +++ 4 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e9f4f12ec3c4..7ff0ea371e3c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -459,6 +459,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d9b33843c80..41777e6d9761 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2852,6 +2852,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static const struct trace_print_flags svm_exit_reasons_str[] = { { SVM_EXIT_READ_CR0, "read_cr0" }, { SVM_EXIT_READ_CR3, "read_cr3" }, @@ -2976,6 +2980,8 @@ static struct kvm_x86_ops svm_x86_ops = { .exit_reasons_str = svm_exit_reasons_str, .gb_page_enable = svm_gb_page_enable, + + .cpuid_update = svm_cpuid_update, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8f2fdc26894..75e8931e96c7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3988,6 +3988,10 @@ static bool vmx_gb_page_enable(void) return false; } +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -4052,6 +4056,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .exit_reasons_str = vmx_exit_reasons_str, .gb_page_enable = vmx_gb_page_enable, + + .cpuid_update = vmx_cpuid_update, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e7bbc49b7e4..e5ac21f992f0 100644 ---
a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1574,6 +1574,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); r = 0; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); out_free: vfree(cpuid_entries); @@ -1596,6 +1597,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); return 0; out: @@ -3733,6 +3735,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } return best; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { -- cgit v1.2.2 From 4e47c7a6d714cf352b719db92a924b6ec487acc5 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:47 +0800 Subject: KVM: VMX: Add instruction rdtscp support for guest Before enabling, execution of "rdtscp" in guest would result in #UD. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/svm.c | 7 +++++ arch/x86/kvm/vmx.c | 60 ++++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/x86.c | 3 ++- 5 files changed, 68 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7ff0ea371e3c..fe4df464fb39 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -521,6 +521,7 @@ struct kvm_x86_ops { int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); bool (*gb_page_enable)(void); + bool (*rdtscp_supported)(void); const struct trace_print_flags *exit_reasons_str; }; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 8f6b0111446a..713ed9a5b1d3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -53,6 +53,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41777e6d9761..7f4e225feebf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2914,6 +2914,11 @@ static bool svm_gb_page_enable(void) return true; } +static bool svm_rdtscp_supported(void) +{ + return false; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2982,6 +2987,8 @@ static struct kvm_x86_ops svm_x86_ops = { .gb_page_enable = svm_gb_page_enable, .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 75e8931e96c7..74a66f0c00b4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -151,6 +151,8 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + bool rdtscp_enabled; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -225,7 +227,7 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -362,6 +364,12 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID; } +static inline int cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + 
SECONDARY_EXEC_RDTSCP; +} + static inline int cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -893,6 +901,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -928,6 +941,9 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. @@ -1017,6 +1033,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_TSC_AUX: + if (!to_vmx(vcpu)->rdtscp_enabled) + return 1; + /* Otherwise falls through */ default: vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); @@ -1080,7 +1100,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - /* Otherwise falls through to kvm_set_msr_common */ + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ default: msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -1260,7 +1288,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING; + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3988,8 +4017,31 @@ static bool vmx_gb_page_enable(void) return false; } +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { + struct kvm_cpuid_entry2 *best; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmx->rdtscp_enabled = false; + if (vmx_rdtscp_supported()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (exec_control & SECONDARY_EXEC_RDTSCP) { + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + } + } } static struct kvm_x86_ops vmx_x86_ops = { @@ -4058,6 +4110,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .gb_page_enable = vmx_gb_page_enable, .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5ac21f992f0..8798504ace11 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1646,6 +1646,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #else unsigned f_lm = 0; #endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? 
F(RDTSCP) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -1665,7 +1666,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = -- cgit v1.2.2 From 953899b659adce62cbe83d6a7527550ab8797c48 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Dec 2009 12:22:20 +0100 Subject: KVM: SVM: Adjust tsc_offset only if tsc_unstable The tsc_offset adjustment in svm_vcpu_load is executed unconditionally even if Linux considers the host tsc as stable. This causes a Linux guest to detect an unstable tsc in any case. This patch removes the tsc_offset adjustment if the host tsc is stable. The guest will now get the benefit of a stable tsc too. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7f4e225feebf..b373ae6fb974 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -765,14 +765,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { u64 delta; - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; -- cgit v1.2.2 From 50eb2a3cd0f50d912b26d0b79b7f443344608390 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 20 Dec 2009 15:00:10 +0200 Subject: KVM: Add KVM_MMIO kconfig item s390 doesn't have mmio; this will simplify ifdefing it out. Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4cd498332466..06871111bf54 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select USER_RETURN_NOTIFIER + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent -- cgit v1.2.2 From 46a26bf55714c1e2f17e34683292a389acb8e601 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:16 -0200 Subject: KVM: modify memslots layout in struct kvm Have a pointer to an allocated region inside struct kvm.
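The indirection introduced below changes every access pattern from kvm->memslots[i] / kvm->nmemslots to a pointer dereference, which is what later allows the whole slot array to be replaced and freed as one unit under SRCU (see the final commit in this series):

/* sketch of the new access pattern, as in kvm_handle_hva() below */
struct kvm_memslots *slots = kvm->memslots;
int i;

for (i = 0; i < slots->nmemslots; i++) {
        struct kvm_memory_slot *memslot = &slots->memslots[i];
        /* ... operate on memslot ... */
}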
[alex: fix ppc book 3s] Signed-off-by: Alexander Graf Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 11 ++++++----- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4f499d7f7106..81f84d326a84 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -807,13 +807,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, { int i, j; int retval = 0; + struct kvm_memslots *slots = kvm->memslots; /* * If mmap_sem isn't taken, we can look the memslots with only * the mmu_lock by skipping over the slots with userspace_addr == 0. */ - for (i = 0; i < kvm->nmemslots; i++) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; @@ -3021,8 +3022,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_mmu_pages; unsigned int nr_pages = 0; - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; + for (i = 0; i < kvm->memslots->nmemslots; i++) + nr_pages += kvm->memslots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3295,7 +3296,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) int i, j, k; for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_memory_slot *m = &vcpu->kvm->memslots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 74a66f0c00b4..18698799e365 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1503,8 +1503,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; + gfn_t base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8798504ace11..3b81cb9da8b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2427,7 +2427,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; memset(memslot->dirty_bitmap, 0, n); } @@ -5223,7 +5223,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int user_alloc) { int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + struct kvm_memory_slot *memslot = &kvm->memslots->memslots[mem->slot]; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. -- cgit v1.2.2 From fef9cce0eb28a67e688a411cc30b73625e49002b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:17 -0200 Subject: KVM: modify alias layout in x86's struct kvm_arch Have a pointer to an allocated region inside x86's kvm_arch.
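Alias lookups follow the same pointer-chasing pattern; condensed from unalias_gfn() in the diff below:

struct kvm_mem_aliases *aliases = kvm->arch.aliases;
int i;

for (i = 0; i < aliases->naliases; ++i) {
        struct kvm_mem_alias *alias = &aliases->aliases[i];

        if (gfn >= alias->base_gfn &&
            gfn < alias->base_gfn + alias->npages)
                return alias->target_gfn + gfn - alias->base_gfn;
}
return gfn;     /* gfn is not aliased */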
Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 8 ++++++-- arch/x86/kvm/x86.c | 21 ++++++++++++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fe4df464fb39..7cdcb3d0f770 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -370,9 +370,13 @@ struct kvm_mem_alias { gfn_t target_gfn; }; -struct kvm_arch{ - int naliases; +struct kvm_mem_aliases { struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; + int naliases; +}; + +struct kvm_arch { + struct kvm_mem_aliases *aliases; unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b81cb9da8b8..1ce833191430 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2227,9 +2227,10 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases = kvm->arch.aliases; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; if (gfn >= alias->base_gfn && gfn < alias->base_gfn + alias->npages) return alias->target_gfn + gfn - alias->base_gfn; @@ -2247,6 +2248,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases; r = -EINVAL; /* General sanity checks */ @@ -2266,15 +2268,17 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, down_write(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); - p = &kvm->arch.aliases[alias->slot]; + aliases = kvm->arch.aliases; + + p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) + if (aliases->aliases[n - 1].npages) break; - kvm->arch.naliases = n; + aliases->naliases = n; spin_unlock(&kvm->mmu_lock); kvm_mmu_zap_all(kvm); @@ -5158,6 +5162,12 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5214,6 +5224,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + kfree(kvm->arch.aliases); kfree(kvm); } -- cgit v1.2.2 From f7784b8ec9b6a041fa828cfbe9012fe51933f5ac Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:18 -0200 Subject: KVM: split kvm_arch_set_memory_region into prepare and commit Required for SRCU conversion later.
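[editor's note: a rough sketch of the generic-side flow this split enables; the real caller, __kvm_set_memory_region in virt/kvm/kvm_main.c, is outside this arch/x86 excerpt, so the wrapper name and ordering here are an approximation, while the two hook signatures come from the diff below]

static int set_memory_region(struct kvm *kvm,
                             struct kvm_userspace_memory_region *mem,
                             struct kvm_memory_slot *new,
                             struct kvm_memory_slot old, int user_alloc)
{
        int r;

        r = kvm_arch_prepare_memory_region(kvm, new, old, mem, user_alloc);
        if (r)
                return r;       /* nothing published yet: trivial unwind */

        /* generic code installs the new slot here; once this becomes an
         * RCU pointer swap, the step must neither sleep nor fail */

        kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
        return 0;
}

Splitting the hook this way keeps the fallible work (mapping userspace memory) before the publication point and the irreversible cleanup (munmap, mmu page accounting) after it.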
Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ce833191430..43da65feed49 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5228,13 +5228,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm); } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots->memslots[mem->slot]; + int npages = memslot->npages; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5254,26 +5254,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); - /* set userspace_addr atomically for kvm_hva_to_rmapp */ - spin_lock(&kvm->mmu_lock); memslot->userspace_addr = userspace_addr; - spin_unlock(&kvm->mmu_lock); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } } } + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -5282,8 +5291,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) -- cgit v1.2.2 From bc6678a33d9b952981a8e44a4f876c3ad64ca4d8 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:21 -0200 Subject: KVM: introduce kvm->srcu and convert kvm_set_memory_region to SRCU update Use two steps for memslot deletion: mark the slot invalid (which stops instantiation of new shadow pages for that slot, but allows destruction), then instantiate the new empty slot. Also simplifies kvm_handle_hva locking. 
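[editor's note: the reader-side pattern the hunks below adopt, shown in isolation as a hypothetical helper; it assumes kvm->srcu has been initialized by the generic code introduced in this same commit]

static unsigned long count_guest_pages(struct kvm *kvm)
{
        struct kvm_memslots *slots;
        unsigned long pages = 0;
        int i, idx;

        idx = srcu_read_lock(&kvm->srcu);       /* pin current snapshot */
        slots = rcu_dereference(kvm->memslots);
        for (i = 0; i < slots->nmemslots; i++)
                pages += slots->memslots[i].npages;
        srcu_read_unlock(&kvm->srcu, idx);      /* snapshot may now die */
        return pages;
}

Between lock and unlock the snapshot cannot be freed, so the old tricks (skipping slots with userspace_addr == 0 and leaning on mmu_lock) become unnecessary, which is why kvm_handle_hva below loses those comments.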
Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 28 ++++++++++++++-------------- arch/x86/kvm/vmx.c | 6 +++++- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 81f84d326a84..f8bf42a25995 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -807,21 +808,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, { int i, j; int retval = 0; - struct kvm_memslots *slots = kvm->memslots; + struct kvm_memslots *slots; + + slots = rcu_dereference(kvm->memslots); - /* - * If mmap_sem isn't taken, we can look the memslots with only - * the mmu_lock by skipping over the slots with userspace_addr == 0. - */ for (i = 0; i < slots->nmemslots; i++) { struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; - /* mmu_lock protects userspace_addr */ - if (!start) - continue; - end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; @@ -1617,7 +1612,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(__pa(pte)); __set_bit(slot, sp->slot_bitmap); @@ -3021,9 +3016,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; + struct kvm_memslots *slots; - for (i = 0; i < kvm->memslots->nmemslots; i++) - nr_pages += kvm->memslots->memslots[i].npages; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3293,10 +3290,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu) static int count_rmaps(struct kvm_vcpu *vcpu) { int nmaps = 0; - int i, j, k; + int i, j, k, idx; + idx = srcu_read_lock(&kvm->srcu); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots->memslots[i]; + struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { @@ -3319,6 +3318,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) } } } + srcu_read_unlock(&kvm->srcu, idx); return nmaps; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 18698799e365..f1cae7d6113d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1503,7 +1503,11 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots->memslots[0].base_gfn + + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = rcu_dereference(kvm->memslots); + base_gfn = kvm->memslots->memslots[0].base_gfn + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } -- cgit v1.2.2 From b050b015abbef8225826eecb6f6b4d4a6dea7b79 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:22 -0200 Subject: KVM: use SRCU for dirty log Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 
43da65feed49..e7488350ca16 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2415,27 +2415,60 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - int n; + int r, n, i; struct kvm_memory_slot *memslot; - int is_dirty = 0; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; down_write(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) goto out; + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); } + r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: up_write(&kvm->slots_lock); return r; -- cgit v1.2.2 From a983fb238728e1123177e8058d4f644b949a7d05 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:23 -0200 Subject: KVM: x86: switch kvm_set_memory_alias to SRCU update Using a similar two-step procedure as for memslots. 
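[editor's note: condensed, the two-step update the diff below implements, with error handling and locking trimmed for clarity]

/* Step 1: publish a copy with the edited slot marked invalid, so
 * concurrent lookups stop instantiating translations through it,
 * then wait out readers and drop shadow pages built on the old
 * mapping. */
memcpy(new, kvm->arch.aliases, sizeof(*new));
new->aliases[slot].flags |= KVM_ALIAS_INVALID;
rcu_assign_pointer(kvm->arch.aliases, new);
synchronize_srcu_expedited(&kvm->srcu);
kvm_mmu_zap_all(kvm);

/* Step 2: publish a second copy carrying the new alias contents,
 * with KVM_ALIAS_INVALID cleared again. */

unalias_gfn_instantiation() skips invalid entries, so between the two steps no new shadow pages can be built through the alias being changed, while plain unalias_gfn() still resolves it for teardown.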
Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 4 +++ arch/x86/kvm/x86.c | 60 ++++++++++++++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7cdcb3d0f770..6c8c7c578c46 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -368,8 +368,12 @@ struct kvm_mem_alias { gfn_t base_gfn; unsigned long npages; gfn_t target_gfn; +#define KVM_ALIAS_INVALID 1UL + unsigned long flags; }; +#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION + struct kvm_mem_aliases { struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; int naliases; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e7488350ca16..28127c936c3b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -2223,11 +2224,32 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases = kvm->arch.aliases; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); for (i = 0; i < aliases->naliases; ++i) { alias = &aliases->aliases[i]; @@ -2248,7 +2270,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; - struct kvm_mem_aliases *aliases; + struct kvm_mem_aliases *aliases, *old_aliases; r = -EINVAL; /* General sanity checks */ @@ -2265,28 +2287,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + down_write(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); - aliases = kvm->arch.aliases; + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); for (n = KVM_ALIAS_SLOTS; n > 0; --n) if (aliases->aliases[n - 1].npages) break; aliases->naliases = n; - spin_unlock(&kvm->mmu_lock); - kvm_mmu_zap_all(kvm); + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; +out_unlock: up_write(&kvm->slots_lock); - - 
return 0; - out: return r; } -- cgit v1.2.2 From e93f8a0f821e290ac5149830110a5f704db7a1fc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:24 -0200 Subject: KVM: convert io_bus to SRCU Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 6 +++--- arch/x86/kvm/i8259.c | 4 +++- arch/x86/kvm/x86.c | 13 +++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 15578f180e59..4b433de02e5b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); if (ret < 0) goto fail_unregister; @@ -660,7 +660,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return pit; fail_unregister: - __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: if (pit->irq_source_id >= 0) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0cbd245..b7d145b20953 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -533,7 +533,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + down_write(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + up_write(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28127c936c3b..9b42673df4af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2851,7 +2851,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2860,7 +2860,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, @@ -3345,11 +3345,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } @@ -3360,7 +3361,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu) int i, r = 0; for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, io->port, io->size, pd)) { r = -EOPNOTSUPP; break; -- cgit 
v1.2.2 From f656ce0185cabbbb0cf96877306879661297c7ad Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:25 -0200 Subject: KVM: switch vcpu context to use SRCU Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 7 +++---- arch/x86/kvm/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 43 ++++++++++++++++++++++++------------------- 3 files changed, 30 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f8bf42a25995..25aabd00aa01 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2933,10 +2933,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages; + int npages, idx; - if (!down_read_trylock(&kvm->slots_lock)) - continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -2949,7 +2948,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f1cae7d6113d..22ab7137d1d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2478,10 +2478,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; + int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2589,7 +2589,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b42673df4af..53bc06a68105 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1306,15 +1306,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i; + int i, idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); @@ -3900,14 +3900,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu) static void vapic_exit(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + int idx; if (!apic || !apic->vapic_addr) return; - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -4036,7 +4037,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_to_vapic(vcpu); } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -4078,7 +4079,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -4100,6 +4101,7 @@ out: static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; + struct kvm *kvm = vcpu->kvm; if 
(unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", @@ -4111,7 +4113,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; @@ -4119,9 +4121,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { @@ -4156,13 +4158,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ++vcpu->stat.signal_exits; } if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -4201,10 +4203,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, EMULTYPE_NO_DECODE); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. @@ -4967,11 +4969,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { unsigned long vaddr = tr->linear_address; gpa_t gpa; + int idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -5223,11 +5226,13 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + int idx; + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } -- cgit v1.2.2 From 79fac95ecfa3969aab8119d37ccd7226165f933a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:26 -0200 Subject: KVM: convert slots_lock to a mutex Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/i8259.c | 4 ++-- arch/x86/kvm/vmx.c | 8 ++++---- arch/x86/kvm/x86.c | 16 ++++++++-------- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4b433de02e5b..6a74246f80c6 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index b7d145b20953..d5753a75d58c 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -533,9 +533,9 @@ struct 
kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 22ab7137d1d0..f04e2ff21383 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2223,7 +2223,7 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2236,7 +2236,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2245,7 +2245,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.ept_identity_pagetable) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2260,7 +2260,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53bc06a68105..aff3479867a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2208,14 +2208,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; spin_unlock(&kvm->mmu_lock); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -2292,7 +2292,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, if (!aliases) goto out; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); /* invalidate any gfn reference in case of deletion/shrinking */ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); @@ -2328,7 +2328,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, r = 0; out_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); out: return r; } @@ -2462,7 +2462,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, unsigned long is_dirty = 0; unsigned long *dirty_bitmap = NULL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EINVAL; if (log->slot >= KVM_MEMORY_SLOTS) @@ -2512,7 +2512,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, out_free: vfree(dirty_bitmap); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2625,7 +2625,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2634,7 +2634,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { -- cgit v1.2.2 From 
0680fe52753381cb7154beeb01ef3e48f2cdeec6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 27 Dec 2009 17:00:46 +0200 Subject: KVM: Bump maximum vcpu count to 64 With slots_lock converted to rcu, the entire kvm hotpath on modern processors (with npt or ept) now scales beautifully. Increase the maximum vcpu count to 64 to reflect this. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6c8c7c578c46..741b8972a3a5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,7 +25,7 @@ #include #include -#define KVM_MAX_VCPUS 16 +#define KVM_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 -- cgit v1.2.2 From f4c9e87c83a9f5bc1800db27dbb39e5cd1254c0a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 28 Dec 2009 16:06:35 +0200 Subject: KVM: Fill out ftrace exit reason strings Some exit reasons missed their strings; fill out the table. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 58 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f04e2ff21383..f4486f460278 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3993,29 +3993,49 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } +#define _ER(x) { EXIT_REASON_##x, #x } + static const struct trace_print_flags vmx_exit_reasons_str[] = { - { EXIT_REASON_EXCEPTION_NMI, "exception" }, - { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, - { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, - { EXIT_REASON_NMI_WINDOW, "nmi_window" }, - { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, - { EXIT_REASON_CR_ACCESS, "cr_access" }, - { EXIT_REASON_DR_ACCESS, "dr_access" }, - { EXIT_REASON_CPUID, "cpuid" }, - { EXIT_REASON_MSR_READ, "rdmsr" }, - { EXIT_REASON_MSR_WRITE, "wrmsr" }, - { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, - { EXIT_REASON_HLT, "halt" }, - { EXIT_REASON_INVLPG, "invlpg" }, - { EXIT_REASON_VMCALL, "hypercall" }, - { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, - { EXIT_REASON_APIC_ACCESS, "apic_access" }, - { EXIT_REASON_WBINVD, "wbinvd" }, - { EXIT_REASON_TASK_SWITCH, "task_switch" }, - { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + _ER(EXCEPTION_NMI), + _ER(EXTERNAL_INTERRUPT), + _ER(TRIPLE_FAULT), + _ER(PENDING_INTERRUPT), + _ER(NMI_WINDOW), + _ER(TASK_SWITCH), + _ER(CPUID), + _ER(HLT), + _ER(INVLPG), + _ER(RDPMC), + _ER(RDTSC), + _ER(VMCALL), + _ER(VMCLEAR), + _ER(VMLAUNCH), + _ER(VMPTRLD), + _ER(VMPTRST), + _ER(VMREAD), + _ER(VMRESUME), + _ER(VMWRITE), + _ER(VMOFF), + _ER(VMON), + _ER(CR_ACCESS), + _ER(DR_ACCESS), + _ER(IO_INSTRUCTION), + _ER(MSR_READ), + _ER(MSR_WRITE), + _ER(MWAIT_INSTRUCTION), + _ER(MONITOR_INSTRUCTION), + _ER(PAUSE_INSTRUCTION), + _ER(MCE_DURING_VMENTRY), + _ER(TPR_BELOW_THRESHOLD), + _ER(APIC_ACCESS), + _ER(EPT_VIOLATION), + _ER(EPT_MISCONFIG), + _ER(WBINVD), { -1, NULL } }; +#undef _ER + static bool vmx_gb_page_enable(void) { return false; -- cgit v1.2.2 From c9c5417455b0c2e3c164883354b5480e5aee3b36 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 5 Jan 2010 19:02:26 +0800 Subject: KVM: x86: Moving PT_*_LEVEL to mmu.h We can use them in x86.c and vmx.c now... 
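[editor's note: for orientation, each x86 paging level below the root covers 9 more bits of address, so these constants map straight onto page sizes; a self-contained illustration, not kernel code]

#include <stdio.h>

#define PAGE_SHIFT              12
#define PT_PAGE_TABLE_LEVEL     1       /* 4 KiB pages */
#define PT_DIRECTORY_LEVEL      2       /* 2 MiB pages */
#define PT_PDPE_LEVEL           3       /* 1 GiB pages */

static unsigned long long page_size(int level)
{
        return 1ULL << (PAGE_SHIFT + 9 * (level - 1));
}

int main(void)
{
        int level;

        for (level = PT_PAGE_TABLE_LEVEL; level <= PT_PDPE_LEVEL; level++)
                printf("level %d -> %llu bytes\n", level, page_size(level));
        return 0;
}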
Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 4 ---- arch/x86/kvm/mmu.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 25aabd00aa01..12ccf14f8539 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -143,10 +143,6 @@ module_param(oos_shadow, bool, 0644); #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 4567d8042b22..ff583423968d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -38,6 +38,10 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 17cc393596823f4bbab81e68a9e23e7beadbcfca Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 5 Jan 2010 19:02:27 +0800 Subject: KVM: x86: Rename gb_page_enable() to get_lpage_level() in kvm_x86_ops Then the callback can provide the maximum supported large page level, which is more flexible. Also move the gb page support into x86_64 specific. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 4 +++- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 741b8972a3a5..a4de557ad733 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -528,7 +528,7 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - bool (*gb_page_enable)(void); + int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); const struct trace_print_flags *exit_reasons_str; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b373ae6fb974..cf64fc026e3e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2911,9 +2911,9 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void) { - return true; + return PT_PDPE_LEVEL; } static bool svm_rdtscp_supported(void) @@ -2986,7 +2986,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, .cpuid_update = svm_cpuid_update, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f4486f460278..0fd0892553ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4036,9 +4036,9 @@ static const struct trace_print_flags vmx_exit_reasons_str[] = { #undef _ER -static bool vmx_gb_page_enable(void) +static int vmx_get_lpage_level(void) { - return false; + return PT_DIRECTORY_LEVEL; } static inline u32 bit(int bitno) @@ -4131,7 +4131,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, - .gb_page_enable = vmx_gb_page_enable, + .get_lpage_level = vmx_get_lpage_level, .cpuid_update = vmx_cpuid_update, diff --git a/arch/x86/kvm/x86.c 
b/arch/x86/kvm/x86.c index aff3479867a8..c990424d86d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1641,10 +1641,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; - unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; unsigned f_lm = F(LM); #else + unsigned f_gbpages = 0; unsigned f_lm = 0; #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; -- cgit v1.2.2 From 878403b788bff1af9c7f1a61e104f0c77115af29 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 5 Jan 2010 19:02:29 +0800 Subject: KVM: VMX: Enable EPT 1GB page support Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/mmu.c | 8 +++++--- arch/x86/kvm/vmx.c | 11 ++++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 713ed9a5b1d3..43f1e9b45917 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -364,6 +364,7 @@ enum vmcs_field { #define VMX_EPTP_UC_BIT (1ull << 8) #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 12ccf14f8539..4f5508c35100 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -500,8 +500,7 @@ out: static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level; - int level = PT_PAGE_TABLE_LEVEL; + int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) @@ -512,7 +511,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) + max_level = kvm_x86_ops->get_lpage_level() < host_level ? 
+ kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0fd0892553ec..9b197b25b66d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -318,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void) return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); } +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -4038,7 +4043,11 @@ static const struct trace_print_flags vmx_exit_reasons_str[] = { static int vmx_get_lpage_level(void) { - return PT_DIRECTORY_LEVEL; + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; } static inline u32 bit(int bitno) -- cgit v1.2.2 From 0d178975d0a5afe5e0fd3211bd1397905b225be5 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 6 Jan 2010 17:55:23 +0900 Subject: KVM: Fix the explanation of write_emulated The explanation of write_emulated is confused with that of read_emulated. This patch fixes it. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7c18e1230f54..9b697c2735d9 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -74,7 +74,7 @@ struct x86_emulate_ops { struct kvm_vcpu *vcpu); /* - * write_emulated: Read bytes from emulated/special memory area. + * write_emulated: Write bytes to emulated/special memory area. * @addr: [IN ] Linear address to which to write. * @val: [IN ] Value to write to memory (low-order bytes used as * required). -- cgit v1.2.2 From a1f83a74feaa9718a5c61587256ea6cc1b993d16 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Dec 2009 17:33:58 +0200 Subject: KVM: VMX: trace clts and lmsw instructions as cr accesses clts writes cr0.ts; lmsw writes cr0[0:15] - record that in ftrace. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9b197b25b66d..7c7b2eeea5d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2997,6 +2997,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + trace_kvm_cr_write(0, vcpu->arch.cr0); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -3016,7 +3017,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 3: /* lmsw */ - kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (vcpu->arch.cr0 & ~0xful) | val); + kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); return 1; -- cgit v1.2.2 From 4d4ec0874583b127caac1d0f84033c8971b2fd2a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Dec 2009 18:07:30 +0200 Subject: KVM: Replace read accesses of vcpu->arch.cr0 by an accessor Since we'd like to allow the guest to own a few bits of cr0 at times, we need to know when we access those bits.
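[editor's note: the accessor starts out as a trivial mask of the cached value, as the kvm_cache_regs.h hunk below shows; its point is to be the single choke point that a later patch in this series extends with a decache step, ending up as:]

static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
{
        /* only refresh when the caller asks about bits the guest may
         * have changed without causing an exit */
        if (mask & vcpu->arch.cr0_guest_owned_bits)
                kvm_x86_ops->decache_cr0_guest_bits(vcpu);
        return vcpu->arch.cr0 & mask;
}

static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
{
        return kvm_read_cr0_bits(vcpu, ~0UL);
}

Callers that only care about host-owned bits (the X86_CR0_PG and X86_CR0_WP checks, for instance) thus never pay for a VMCS read.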
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +++--- arch/x86/kvm/kvm_cache_regs.h | 10 ++++++++++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/svm.c | 9 +++++---- arch/x86/kvm/vmx.c | 16 ++++++++-------- arch/x86/kvm/x86.c | 20 ++++++++++---------- 7 files changed, 38 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e8faea4651e..0f89e320bc96 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1515,7 +1515,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) /* syscall is not available in real mode */ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) + || !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) return -1; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1569,7 +1569,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) /* inject #GP if in real mode or paging is disabled */ if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } @@ -1635,7 +1635,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) /* inject #GP if in real mode or paging is disabled */ if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + || !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 35acc36e1782..f46859751b30 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -38,6 +38,16 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { if (mask & vcpu->arch.cr4_guest_owned_bits) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4f5508c35100..276bf7497c36 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static int is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_WP; + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } static int is_cpuid_PSE36(void) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ff583423968d..599159f728b9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -79,7 +79,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu) static inline int is_paging(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_PG; + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } static inline int is_present_gpte(unsigned long pte) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cf64fc026e3e..d3246ce70ae8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -980,7 +980,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (npt_enabled) goto set; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { + if (kvm_read_cr0_bits(vcpu, X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; } @@ -1244,7 +1244,7 @@ static int ud_interception(struct vcpu_svm *svm) static int nm_interception(struct vcpu_svm *svm) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) + if 
(!kvm_read_cr0_bits(&svm->vcpu, X86_CR0_TS)) svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; @@ -1743,7 +1743,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -2387,7 +2387,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (npt_enabled) { int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { + if ((kvm_read_cr0_bits(vcpu, X86_CR0_PG) ^ svm->vmcb->save.cr0) + & X86_CR0_PG) { svm_set_cr0(vcpu, svm->vmcb->save.cr0); mmu_reload = 1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7c7b2eeea5d0..4c7177c489ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -799,7 +799,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) return; vcpu->fpu_active = 1; vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) + if (kvm_read_cr0_bits(vcpu, X86_CR0_TS)) vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -1785,7 +1785,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) + if (kvm_read_cr0_bits(vcpu, X86_CR0_PE)) vmx_fpu_deactivate(vcpu); } @@ -1840,7 +1840,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) /* if real mode */ return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2095,7 +2095,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2580,7 +2580,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); @@ -2996,8 +2996,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 2: /* clts */ vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - trace_kvm_cr_write(0, vcpu->arch.cr0); + vmcs_writel(CR0_READ_SHADOW, kvm_read_cr0(vcpu)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -3018,7 +3018,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; case 3: /* lmsw */ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (vcpu->arch.cr0 & ~0xful) | val); + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c990424d86d0..748b15d8e46d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -430,7 +430,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG 
"set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); + cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } @@ -488,7 +488,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -3095,7 +3095,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); return X86EMUL_CONTINUE; } @@ -3714,7 +3714,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) switch (cr) { case 0: - value = vcpu->arch.cr0; + value = kvm_read_cr0(vcpu); break; case 2: value = vcpu->arch.cr2; @@ -3741,7 +3741,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, { switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); *rflags = kvm_get_rflags(vcpu); break; case 2: @@ -4335,7 +4335,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - sregs->cr0 = vcpu->arch.cr0; + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; sregs->cr4 = kvm_read_cr4(vcpu); @@ -4521,7 +4521,7 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct kvm_segment kvm_seg; - if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; @@ -4799,7 +4799,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) &nseg_desc); } - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); @@ -4834,7 +4834,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; @@ -4873,7 +4873,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(vcpu->arch.cr0 & X86_CR0_PE)) + !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); -- cgit v1.2.2 From e8467fda83cdc9de53972fee0cd2e6916cf66f41 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Dec 2009 18:43:06 +0200 Subject: KVM: VMX: Allow the guest to own some cr0 bits We will use this later to give the guest ownership of cr0.ts. 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/kvm_cache_regs.h | 2 ++ arch/x86/kvm/svm.c | 5 +++++ arch/x86/kvm/vmx.c | 9 +++++++++ 4 files changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a4de557ad733..693046a7a12d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -269,6 +269,7 @@ struct kvm_vcpu_arch { u32 regs_dirty; unsigned long cr0; + unsigned long cr0_guest_owned_bits; unsigned long cr2; unsigned long cr3; unsigned long cr4; @@ -489,6 +490,7 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index f46859751b30..6b419a36cbd9 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -40,6 +40,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { + if (mask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); return vcpu->arch.cr0 & mask; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d3246ce70ae8..3899c2d19830 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -956,6 +956,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) svm->vmcb->save.gdtr.base = dt->base ; } +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +} + static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } @@ -2948,6 +2952,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4c7177c489ac..dbcdb55094f7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1653,6 +1653,14 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; @@ -4106,6 +4114,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, -- cgit v1.2.2 From 02daab21d94dc4cf01b2fd09863d59a436900322 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 Dec 2009 12:40:26 +0200 Subject: KVM: Lazify fpu activation and deactivation Defer fpu deactivation as much as possible - if the guest fpu is loaded, keep it loaded until the next heavyweight exit (where we are forced to unload it). 
This reduces unnecessary exits. We also defer fpu activation on clts; while clts signals the intent to use the fpu, we can't be sure the guest will actually use it. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 35 +++++++++++++++++++++-------------- arch/x86/kvm/vmx.c | 25 +++++++++---------------- arch/x86/kvm/x86.c | 7 ++++++- 4 files changed, 37 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 693046a7a12d..93bee7abb71c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -506,6 +506,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3899c2d19830..5b336a80f31e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -984,17 +984,11 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (npt_enabled) goto set; - if (kvm_read_cr0_bits(vcpu, X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } - vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } set: /* * re-enable caching here because the QEMU bios @@ -1250,6 +1244,8 @@ static int nm_interception(struct vcpu_svm *svm) svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); if (!kvm_read_cr0_bits(&svm->vcpu, X86_CR0_TS)) svm->vmcb->save.cr0 &= ~X86_CR0_TS; + else + svm->vmcb->save.cr0 |= X86_CR0_TS; svm->vcpu.fpu_active = 1; return 1; @@ -2586,6 +2582,8 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu) static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { + if (npt_enabled) + vcpu->fpu_active = 1; } static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) @@ -2805,12 +2803,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = root; force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } } static int is_disabled(void) @@ -2926,6 +2918,20 @@ static bool svm_rdtscp_supported(void) return false; } +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (npt_enabled) { + /* hack: npt requires active fpu at this time */ + vcpu->fpu_active = 1; + return; + } + + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; + svm->vmcb->save.cr0 |= X86_CR0_TS; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2967,6 +2973,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dbcdb55094f7..d11be3fb7c80 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -66,7 +66,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); #define KVM_GUEST_CR0_MASK \ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define 
KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) + (X86_CR0_WP | X86_CR0_NE | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ @@ -579,9 +579,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) + | (1u << NM_VECTOR); /* * Unconditionally intercept #DB so we can maintain dr6 without * reading it every exit. @@ -595,6 +594,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -806,9 +807,6 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -1737,8 +1735,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - vmx_fpu_deactivate(vcpu); - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -1757,12 +1753,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS; + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -1793,8 +1789,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (kvm_read_cr0_bits(vcpu, X86_CR0_PE)) - vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -3002,11 +2996,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, kvm_read_cr0(vcpu)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -4127,6 +4119,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 748b15d8e46d..1de2ad7a004d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1509,8 +1509,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -4006,6 +4006,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } } preempt_disable(); @@ -5075,6 +5079,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.guest_fx_image); kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); } 
EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); -- cgit v1.2.2 From edcafe3c5a06f46407c3f60145a36f269e56ff7f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 Dec 2009 18:07:40 +0200 Subject: KVM: VMX: Give the guest ownership of cr0.ts when the fpu is active If the guest fpu is loaded, there is nothing interesting about cr0.ts; let the guest play with it as it will. This makes context switches between fpu intensive guest processes faster, as we won't trap the clts and cr0 write instructions. [marcelo: fix cr0 read shadow update on fpu deactivation; kills F8 install] Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d11be3fb7c80..fc1964d5e97c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -803,12 +803,20 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) if (kvm_read_cr0_bits(vcpu, X86_CR0_TS)) vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { + vmx_decache_cr0_guest_bits(vcpu); vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -2996,8 +3004,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, kvm_read_cr0(vcpu)); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); return 1; -- cgit v1.2.2 From f9a48e6a18c210c4bf34769fa05ede250575c6a1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 6 Jan 2010 19:10:22 +0200 Subject: KVM: Set cr0.et when the guest writes cr0 Follow the hardware. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1de2ad7a004d..1ad34d185da9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -428,6 +428,8 @@ out: void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + cr0 |= X86_CR0_ET; + if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, kvm_read_cr0(vcpu)); -- cgit v1.2.2 From dc77270f960a8b3cc39a6349a9fad58cc6053d53 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 6 Jan 2010 13:13:01 +0200 Subject: KVM: SVM: Fix SVM_CR0_SELECTIVE_MASK Instead of selecting TS and MP as the comments say, the macro included TS and PE. Luckily the macro is unused now, but fix it in order to save a few hours of debugging for anyone who attempts to use it.
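As a quick sanity check of the mask fix, the architectural CR0 bit positions make the error visible at a glance (a minimal sketch for illustration; these values are the standard x86 definitions, not new code from the patch):

/* CR0.PE is bit 0, CR0.MP is bit 1, CR0.TS is bit 3 */
#define X86_CR0_PE 0x00000001UL
#define X86_CR0_MP 0x00000002UL
#define X86_CR0_TS 0x00000008UL

/* old: (1 << 3 | 1) == 0x9 == TS|PE, despite the "TS and MP" comment */
/* new: (X86_CR0_TS | X86_CR0_MP) == 0xa == TS|MP, as intended */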
Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1fecb7e61130..38638cd2fa4c 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_ERR -1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" -- cgit v1.2.2 From bff7827479ed004c0394e6e7b35ae601bb1a97ad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jan 2010 13:16:08 +0200 Subject: KVM: SVM: Initialize fpu_active in init_vmcb() init_vmcb() sets up the intercepts as if the fpu is active, so initialize it there. This prevents an INIT from setting up intercepts inconsistent with fpu_active. Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5b336a80f31e..949990e4a22e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + svm->vcpu.fpu_active = 1; + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; @@ -730,7 +732,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; -- cgit v1.2.2 From 888f9f3e0cfa32baf05b3840f0248f5502292a0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jan 2010 12:14:04 +0200 Subject: KVM: SVM: Restore unconditional cr0 intercept under npt Currently we don't intercept cr0 at all when npt is enabled. This improves performance but requires us to activate the fpu at all times. Remove this behaviour in preparation for adding selective cr0 intercepts.
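Once the unconditional cr0 write intercept can be dropped, the guest's cr0 lives only in the VMCB between exits. A sketch of the exit-time sync this requires, restating the handle_exit() hunk in the diff below rather than adding new logic:

/* on every #VMEXIT: if cr0 writes were not intercepted, the
 * in-memory copy may be stale; refresh it from the VMCB */
if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
        vcpu->arch.cr0 = svm->vmcb->save.cr0;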
Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 949990e4a22e..27273ed24c41 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -643,10 +643,8 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); - control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); - control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; save->cr3 = 0; save->cr4 = 0; @@ -982,15 +980,13 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } } #endif - if (npt_enabled) - goto set; - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; + + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; -set: /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -2386,21 +2382,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); - if (npt_enabled) { - int mmu_reload = 0; - if ((kvm_read_cr0_bits(vcpu, X86_CR0_PG) ^ svm->vmcb->save.cr0) - & X86_CR0_PG) { - svm_set_cr0(vcpu, svm->vmcb->save.cr0); - mmu_reload = 1; - } + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (mmu_reload) { - kvm_mmu_reset_context(vcpu); - kvm_mmu_load(vcpu); - } - } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; -- cgit v1.2.2 From d225157bc6a442b1214882635fbf287d7d0e8133 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 6 Jan 2010 10:55:27 +0200 Subject: KVM: SVM: Selective cr0 intercept If two conditions apply: - no bits outside TS and MP differ between the host and guest cr0 - the fpu is active then we can activate the selective cr0 write intercept and drop the unconditional cr0 read and write intercept, and allow the guest to run with the host fpu state. This reduces cr0 exits due to guest fpu management while the guest fpu is loaded.
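The decision itself reduces to a simple comparison; a simplified sketch of update_cr0_intercept() from the diff below (set_cr0_intercepts() and clear_cr0_intercepts() are hypothetical helpers standing in for the read/write mask updates the patch open-codes):

ulong gcr0 = svm->vcpu.arch.cr0;        /* guest's view of cr0 */
u64 hcr0 = svm->vmcb->save.cr0;         /* effective (host) cr0 */

if (gcr0 == hcr0 && svm->vcpu.fpu_active)
        clear_cr0_intercepts(svm);      /* selective intercept suffices */
else
        set_cr0_intercepts(svm);        /* keep full cr0 read/write traps */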
Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 27273ed24c41..83c7ab1bdad8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -571,6 +571,7 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -963,6 +964,27 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + } else { + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -994,6 +1016,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + update_cr0_intercept(svm); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1239,11 +1262,8 @@ static int ud_interception(struct vcpu_svm *svm) static int nm_interception(struct vcpu_svm *svm) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!kvm_read_cr0_bits(&svm->vcpu, X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; - else - svm->vmcb->save.cr0 |= X86_CR0_TS; svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); return 1; } @@ -2296,7 +2316,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, @@ -2914,8 +2934,8 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) return; } + update_cr0_intercept(svm); svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; - svm->vmcb->save.cr0 |= X86_CR0_TS; } static struct kvm_x86_ops svm_x86_ops = { -- cgit v1.2.2 From 4610c83cdc8bff04f2f22883749f716b1ccc502f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jan 2010 12:19:20 +0200 Subject: KVM: SVM: Lazy fpu with npt Now that we can allow the guest to play with cr0 when the fpu is loaded, we can enable lazy fpu when npt is in use. 
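Putting the pieces together, the #NM exit is now the single point where the fpu is re-activated; this is a consolidated restatement of the nm_interception() hunk shown above, not new code:

static int nm_interception(struct vcpu_svm *svm)
{
        /* guest touched the fpu: stop trapping #NM, mark the fpu
         * active, and let update_cr0_intercept() relax the cr0 traps */
        svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
        svm->vcpu.fpu_active = 1;
        update_cr0_intercept(svm);
        return 1;
}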
Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 83c7ab1bdad8..8d7cb62ebef6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2588,8 +2588,6 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu) static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { - if (npt_enabled) - vcpu->fpu_active = 1; } static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) @@ -2928,12 +2926,6 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (npt_enabled) { - /* hack: npt requires active fpu at this time */ - vcpu->fpu_active = 1; - return; - } - update_cr0_intercept(svm); svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; } -- cgit v1.2.2 From 1d5103c11e32b5028262c073d56375691d51a886 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 17 Jan 2010 15:51:21 +0200 Subject: KVM: Add HYPER-V header file Provide HYPER-V related defines that will be used by following patches. Signed-off-by: Gleb Natapov Signed-off-by: Vadim Rozenfeld Signed-off-by: Avi Kivity --- arch/x86/include/asm/hyperv.h | 186 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 arch/x86/include/asm/hyperv.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h new file mode 100644 index 000000000000..e153a2b3889a --- /dev/null +++ b/arch/x86/include/asm/hyperv.h @@ -0,0 +1,186 @@ +#ifndef _ASM_X86_KVM_HYPERV_H +#define _ASM_X86_KVM_HYPERV_H + +#include + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. 
+ */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) + /* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. + */ +#define HV_X64_CREATE_PARTITIONS (1 << 0) +#define HV_X64_ACCESS_PARTITION_ID (1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) +#define HV_X64_POST_MESSAGES (1 << 4) +#define HV_X64_SIGNAL_EVENTS (1 << 5) +#define HV_X64_CREATE_PORT (1 << 6) +#define HV_X64_CONNECT_PORT (1 << 7) +#define HV_X64_ACCESS_STATS (1 << 8) +#define HV_X64_DEBUGGING (1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) +#define HV_X64_CONFIGURE_PROFILER (1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE (1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. 
+ */ + /* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction + */ +#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX 0x40000002 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. */ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0 0 +#define HV_PROCESSOR_POWER_STATE_C1 1 +#define HV_PROCESSOR_POWER_STATE_C2 2 +#define HV_PROCESSOR_POWER_STATE_C3 3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 + +#endif -- cgit v1.2.2 From 55cd8e5a4edb8e235163ffe8264b9aaa8d7c050f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 17 Jan 2010 15:51:22 +0200 Subject: KVM: Implement bare minimum of HYPER-V MSRs Minimum HYPER-V implementation should have GUEST_OS_ID, HYPERCALL and VP_INDEX MSRs. 
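For context, a guest-side sketch of how these MSRs are meant to be used (hypothetical guest code, not part of the patch; the function name is invented, the constants come from asm/hyperv.h above, and hypercall_gpa is assumed to be a page-aligned guest physical address):

#include <asm/msr.h>            /* wrmsrl(), guest-kernel context */

static void hv_enable_hypercall_page(u64 hypercall_gpa)
{
        /* a nonzero guest OS id must be set first, or the hypercall
         * page stays disabled (see set_msr_hyperv_pw() below) */
        wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0x1ULL);
        wrmsrl(HV_X64_MSR_HYPERCALL,
               (hypercall_gpa & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK) |
               HV_X64_MSR_HYPERCALL_ENABLE);
}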
[avi: fix build on i386] Signed-off-by: Gleb Natapov Signed-off-by: Vadim Rozenfeld Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 + arch/x86/include/asm/kvm_para.h | 1 + arch/x86/kvm/trace.h | 32 +++++++ arch/x86/kvm/x86.c | 193 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 229 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 93bee7abb71c..67d19e422006 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -413,6 +413,10 @@ struct kvm_arch { s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; + + /* fields used by HYPER-V emulation */ + u64 hv_guest_os_id; + u64 hv_hypercall; }; struct kvm_vm_stat { diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c584076a47f4..ffae1420e7d7 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include +#include /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 816e0449db0b..1cb3d0e990f3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -55,6 +55,38 @@ TRACE_EVENT(kvm_hypercall, __entry->a3) ); +/* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hv_hypercall, + TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, + __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + + TP_STRUCT__entry( + __field( __u16, code ) + __field( bool, fast ) + __field( __u16, rep_cnt ) + __field( __u16, rep_idx ) + __field( __u64, ingpa ) + __field( __u64, outgpa ) + ), + + TP_fast_assign( + __entry->code = code; + __entry->fast = fast; + __entry->rep_cnt = rep_cnt; + __entry->rep_idx = rep_idx; + __entry->ingpa = ingpa; + __entry->outgpa = outgpa; + ), + + TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + __entry->code, __entry->fast ? "fast" : "slow", + __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, + __entry->outgpa) +); + /* * Tracepoint for PIO. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ad34d185da9..480137db4770 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -622,9 +622,10 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. 
*/ -#define KVM_SAVE_MSRS_BEGIN 2 +#define KVM_SAVE_MSRS_BEGIN 4 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -1004,6 +1005,74 @@ out: return r; } +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copy_to_user((void __user *)addr, instructions, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x data 0x%llx\n", + msr, data); + + return 1; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1118,6 +1187,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1217,6 +1296,48 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1283,6 +1404,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1398,6 +1529,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XEN_HVM: case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3618,11 +3750,70 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, return a0 | ((gpa_t)a1 << 32); } +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || + !kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = (kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffff); + ingpa = (kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffff); + outgpa = (kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + res = 
HV_STATUS_INVALID_HYPERCALL_CODE; + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; int r = 1; + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); -- cgit v1.2.2 From 10388a07164c1512b3a3d0273b9adc230f82790e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 17 Jan 2010 15:51:23 +0200 Subject: KVM: Add HYPER-V apic access MSRs Implement HYPER-V apic MSRs. Spec defines three MSRs that speed-up access to EOI/TPR/ICR apic registers for PV guests. Signed-off-by: Gleb Natapov Signed-off-by: Vadim Rozenfeld Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 31 ++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.h | 8 ++++++++ arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++++++++---- 4 files changed, 79 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 67d19e422006..a1f0b5dd7d75 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -363,6 +363,8 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ u16 singlestep_cs; unsigned long singlestep_rip; + /* fields used by HYPER-V emulation */ + u64 hv_vapic; }; struct kvm_mem_alias { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba8c045da782..4b224f90087b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + /* if this is ICR write vector before command */ + if (reg == APIC_ICR) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (reg == APIC_ICR) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 40010b09c4aa..f5fe32c5edad 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +} #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 480137db4770..552be51e4d84 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -622,10 +622,11 @@ static inline u32 bit(int bitno) * 
kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 4 +#define KVM_SAVE_MSRS_BEGIN 5 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -1067,10 +1068,36 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x data 0x%llx\n", - msr, data); + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; - return 1; + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; } int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) @@ -1330,6 +1357,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = r; break; } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1530,6 +1563,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: + case KVM_CAP_HYPERV_VAPIC: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.2.2 From c25bc1638a1211f57cccbabdd8b732813b852340 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 17 Jan 2010 15:51:24 +0200 Subject: KVM: Implement NotifyLongSpinWait HYPER-V hypercall Windows issues this hypercall after a guest has been spinning on a spinlock for too many iterations.
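The hypercall input word that kvm_hv_hypercall() decodes (shown in an earlier diff) packs its fields as follows; a restatement for reference, with the layout taken directly from that code:

/* 64-bit hypercall input value, per the decoding above:
 *   bits  0..15  code     (HV_X64_HV_NOTIFY_LONG_SPIN_WAIT == 0x0008)
 *   bit      16  fast     (register- vs. memory-based parameters)
 *   bits 32..43  rep_cnt
 *   bits 48..59  rep_idx
 */
u16 code, rep_cnt, rep_idx;
bool fast;

code    = param & 0xffff;
fast    = (param >> 16) & 0x1;
rep_cnt = (param >> 32) & 0xfff;
rep_idx = (param >> 48) & 0xfff;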
Signed-off-by: Gleb Natapov Signed-off-by: Vadim Rozenfeld Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 552be51e4d84..9f72a443455b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1564,6 +1564,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: + case KVM_CAP_HYPERV_SPIN: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3827,7 +3828,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - res = HV_STATUS_INVALID_HYPERCALL_CODE; + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } ret = res | (((u64)rep_done & 0xfff) << 32); if (longmode) { -- cgit v1.2.2 From 8dae444529230301bc85fc86033aa06a734c1a29 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 18 Jan 2010 18:45:10 +0900 Subject: KVM: rename is_writeble_pte() to is_writable_pte() There are two spellings of "writable" in arch/x86/kvm/mmu.c and paging_tmpl.h. This patch renames is_writeble_pte() to is_writable_pte() and makes grepping easier. The new name is consistent with its own definition: return pte & PT_WRITABLE_MASK; Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 +++++++++--------- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 276bf7497c36..ff2b2e8d72eb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -250,7 +250,7 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_writeble_pte(unsigned long pte) +static int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -632,7 +632,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { @@ -708,7 +708,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } @@ -732,7 +732,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; __set_spte(spte, shadow_trap_nonpresent_pte); @@ -787,7 +787,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); __set_spte(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); @@ -1847,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * is responsibility of mmu_get_page / kvm_sync_page.
* Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*sptep)) + if (!can_unsync && is_writable_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1855,7 +1855,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) + if (is_writable_pte(spte)) spte &= ~PT_WRITABLE_MASK; } } @@ -1876,7 +1876,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*sptep); + int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -1927,7 +1927,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { - if (was_writeble) + if (was_writable) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ede2131a9225..df15a5307d2d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -162,7 +162,7 @@ walk: if (rsvd_fault) goto access_error; - if (write_fault && !is_writeble_pte(pte)) + if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; -- cgit v1.2.2 From 94718da12741ef44e1eb2bfe3ca37db92115a3d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 18 Jan 2010 13:26:34 +0200 Subject: KVM: export <asm/hyperv.h> Needed by <asm/kvm_para.h>. Signed-off-by: Avi Kivity --- arch/x86/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 9f828f87ca35..493092efaa3b 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,6 +11,7 @@ header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h header-y += hw_breakpoint.h +header-y += hyperv.h unifdef-y += e820.h unifdef-y += ist.h -- cgit v1.2.2 From 7062dcaa369cae40c1a59949a5654985d0ae26ea Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 19 Jan 2010 17:43:21 +0800 Subject: KVM: VMX: Remove emulation failure report As Avi noted: >There are two problems with the kernel failure report. First, it >doesn't report enough data - registers, surrounding instructions, etc. >that are needed to explain what is going on. Second, it can flood >dmesg, which is a pretty bad thing to do. So we remove the emulation failure report in handle_invalid_guest_state(), and will instead inspect the guest using a userspace tool in the future. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fc1964d5e97c..516084f3c6d8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3449,7 +3449,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - kvm_report_emulation_failure(vcpu, "emulation failure"); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; -- cgit v1.2.2 From ccd469362e826261ccc261c4c36fb0a346338222 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 19 Jan 2010 15:06:38 +0200 Subject: KVM: fix Hyper-V hypercall warnings and wrong mask value Fix compilation warnings and wrong mask value.
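The mask bug is easiest to see in isolation; a sketch of the corrected 32-bit combining, mirroring the diff below (on i386, kvm_register_read() returns a 32-bit unsigned long, so the shift needs a u64 cast, and the low-half mask needs all eight f's):

/* old: (reg << 32) shifted a 32-bit value, which is undefined,
 * and the 0xffffff mask silently dropped the top byte */
param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
        (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);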
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f72a443455b..9b0758b4d550 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3806,12 +3806,12 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) longmode = is_long_mode(vcpu) && cs_l == 1; if (!longmode) { - param = (kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffff); - ingpa = (kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffff); - outgpa = (kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffff); + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); } #ifdef CONFIG_X86_64 else { -- cgit v1.2.2 From 647492047763c3ee8fe51ecf9a04f39040aa495b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 19 Jan 2010 12:45:23 -0200 Subject: KVM: fix cleanup_srcu_struct on vm destruction cleanup_srcu_struct on VM destruction remains broken: BUG: unable to handle kernel paging request at ffffffffffffffff IP: [] srcu_read_lock+0x16/0x21 RIP: 0010:[] [] srcu_read_lock+0x16/0x21 Call Trace: [] kvm_arch_vcpu_uninit+0x1b/0x48 [kvm] [] kvm_vcpu_uninit+0x9/0x15 [kvm] [] vmx_free_vcpu+0x7f/0x8f [kvm_intel] [] kvm_arch_destroy_vm+0x78/0x111 [kvm] [] kvm_put_kvm+0xd4/0xfe [kvm] Move it to kvm_arch_destroy_vm. Signed-off-by: Marcelo Tosatti Reported-by: Jan Kiszka --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b0758b4d550..322c2c5f9bc4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5547,6 +5547,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm->arch.aliases); kfree(kvm); } -- cgit v1.2.2 From b60d513c32e2ddc8b3e9e1465b94913d44d19810 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 20 Jan 2010 16:47:21 +0900 Subject: KVM: x86: Use macros for x86_emulate_ops to avoid future mistakes The return values from x86_emulate_ops are defined in kvm_emulate.h as macros X86EMUL_*. But in emulate.c, we are comparing the return values from these ops with 0 to check if they're X86EMUL_CONTINUE or not: X86EMUL_CONTINUE is defined as 0 now. To avoid possible mistakes in the future, this patch substitutes "X86EMUL_CONTINUE" for the "0"s that are compared with the return values from x86_emulate_ops. There are probably more places where these macros should be used, but the meaning of the rc values in x86_emulate_insn() was not clear at a glance. Using proper macros in that function would make the flow of each emulation easier to follow, and perhaps safer.
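The substitution pattern is mechanical; a before/after sketch (addr, val and bytes are generic placeholders, the rest follows the emulate.c code in the diff below):

rc = ops->read_emulated(addr, &val, bytes, ctxt->vcpu);
if (rc != X86EMUL_CONTINUE)     /* was: if (rc != 0) */
        return rc;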
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 65 ++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0f89e320bc96..48c7f9f8a08f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1196,7 +1196,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), dest, len, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); @@ -1370,7 +1370,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, int rc; rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || @@ -1385,7 +1385,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, (u32) c->regs[VCPU_REGS_RBX]; rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags |= EFLG_ZF; } @@ -1451,7 +1451,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; break; case OP_NONE: @@ -1749,7 +1749,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) &c->src.val, c->src.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } @@ -1768,12 +1768,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) c->dst.ptr = (void *)c->dst.ptr + (c->src.val & mask) / 8; } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; + if (!(c->d & Mov)) { + /* optimisation - avoid slow emulated read */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } } c->dst.orig_val = c->dst.val; @@ -2039,11 +2042,12 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2058,10 +2062,11 @@ special_insn: c->src.ptr = (unsigned long *)register_address(c, seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; c->dst.type = OP_NONE; /* Disable writeback. 
*/ @@ -2069,10 +2074,11 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); @@ -2102,12 +2108,13 @@ special_insn: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes -- cgit v1.2.2 From f2483415293b180945da707e7dbe74e5daa72651 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: VMX: Fix exceptions of mov to dr Injecting GP without an error code is a bad idea (causes unhandled guest exits). Moreover, we must not skip the instruction if we injected an exception. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 516084f3c6d8..9727773f24b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3045,6 +3045,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) unsigned long val; int dr, reg; + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; dr = vmcs_readl(GUEST_DR7); @@ -3099,20 +3100,22 @@ static int handle_dr(struct kvm_vcpu *vcpu) vcpu->arch.eff_db[dr] = val; break; case 4 ... 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } break; case 6: if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; case 7: if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { -- cgit v1.2.2 From 138ac8d88f91e2a6a278aa5cee9120c714c4ce2d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: VMX: Fix emulation of DR4 and DR5 Make sure DR4 and DR5 are aliased to DR6 and DR7, respectively, if CR4.DE is not set. 
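The architectural rule being implemented, as a standalone sketch (inject_ud() is a hypothetical helper; the real patch factors this into check_dr_alias(), shown in the diff below):

/* with CR4.DE clear, DR4/DR5 are legacy aliases of DR6/DR7;
 * with CR4.DE set, touching them must raise #UD */
if (dr == 4 || dr == 5) {
        if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
                inject_ud(vcpu);
        else
                dr += 2;        /* alias to DR6/DR7 */
}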
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9727773f24b7..c7b99e1f8aae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3039,6 +3039,15 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 0; } +static int check_dr_alias(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return -1; + } + return 0; +} + static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -3081,14 +3090,20 @@ static int handle_dr(struct kvm_vcpu *vcpu) case 0 ... 3: val = vcpu->arch.db[dr]; break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: val = vcpu->arch.dr6; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ val = vcpu->arch.dr7; break; - default: - val = 0; } kvm_register_write(vcpu, reg, val); } else { @@ -3099,12 +3114,10 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; - case 4 ... 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + case 4: + if (check_dr_alias(vcpu) < 0) return 1; - } - break; + /* fall through */ case 6: if (val & 0xffffffff00000000ULL) { kvm_inject_gp(vcpu, 0); @@ -3112,7 +3125,11 @@ static int handle_dr(struct kvm_vcpu *vcpu) } vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ if (val & 0xffffffff00000000ULL) { kvm_inject_gp(vcpu, 0); return 1; -- cgit v1.2.2 From fd7373cce767a8803e79f51bd3fc5f531234657f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: VMX: Clean up DR6 emulation As we trap all debug register accesses, we do not need to switch real DR6 at all. Clean up update_exception_bitmap while we are at it. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7b99e1f8aae..b7e812e9c299 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -579,17 +579,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) - | (1u << NM_VECTOR); - /* - * Unconditionally intercept #DB so we can maintain dr6 without - * reading it every exit.
- */ - eb |= 1u << DB_VECTOR; - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - eb |= 1u << BP_VECTOR; - } + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << NM_VECTOR) | (1u << DB_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) @@ -3777,9 +3772,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ vmcs_writel(HOST_CR0, read_cr0()); - if (vcpu->arch.switch_db_regs) - set_debugreg(vcpu->arch.dr6, 6); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3880,9 +3872,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - if (vcpu->arch.switch_db_regs) - get_debugreg(vcpu->arch.dr6, 6); - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); -- cgit v1.2.2 From c76de350c8a3ba770becc17eaa744dc3c7642295 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: SVM: Clean up and enhance mov dr emulation Enhance mov dr instruction emulation used by SVM so that it properly handles dr4/5: alias to dr6/7 if cr4.de is cleared. Otherwise return EMULATE_FAIL which will let our only possible caller in that scenario, ud_interception, re-inject UD. We do not need to inject faults, SVM does this for us (exceptions take precedence over instruction interceptions). For the same reason, the value overflow checks can be removed. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 5 ++-- arch/x86/kvm/svm.c | 64 +++++++++++++++++++---------------------- arch/x86/kvm/x86.c | 19 ++---------- 3 files changed, 33 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a1f0b5dd7d75..d73ed48587e4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -506,9 +506,8 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); - void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception); + int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); + int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8d7cb62ebef6..4295dfcc6031 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1122,76 +1122,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->vmcb->control.asid = sd->next_asid++; } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long val; switch (dr) { case 0 ... 
3: - val = vcpu->arch.db[dr]; + *dest = vcpu->arch.db[dr]; break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr6; + *dest = vcpu->arch.dr6; else - val = svm->vmcb->save.dr6; + *dest = svm->vmcb->save.dr6; break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr7; + *dest = vcpu->arch.dr7; else - val = svm->vmcb->save.dr7; + *dest = svm->vmcb->save.dr7; break; - default: - val = 0; } - return val; + return EMULATE_DONE; } -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; - switch (dr) { case 0 ... 3: vcpu->arch.db[dr] = value; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - *exception = UD_VECTOR; - return; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - return; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { svm->vmcb->save.dr7 = vcpu->arch.dr7; vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); } - return; - default: - /* FIXME: Possible case? */ - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __func__, dr); - *exception = UD_VECTOR; - return; + break; } + + return EMULATE_DONE; } static int pf_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 322c2c5f9bc4..fd5101b57fa3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3270,29 +3270,14 @@ int emulate_clts(struct kvm_vcpu *vcpu) int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); - return X86EMUL_UNHANDLEABLE; - } + return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - int exception; - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; + return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -- cgit v1.2.2 From 727f5a23e2080b19da392f17fa3158a0941f81f5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: SVM: Trap all debug register accesses To enable proper debug register emulation under all conditions, trap access to all DR0..7. This may be optimized later on. 
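The vmx and svm changes above implement the same architectural rule for dr4/dr5. As a rough standalone model of that rule (illustrative names, not kernel code):

        /* Resolve a mov-dr register number the way the CPU does:
         * with CR4.DE clear, DR4/DR5 alias to DR6/DR7; with CR4.DE
         * set, touching DR4/DR5 raises #UD instead. */
        static int resolve_dr(int dr, int cr4_de, int *raise_ud)
        {
                *raise_ud = 0;
                if (dr == 4 || dr == 5) {
                        if (cr4_de) {
                                *raise_ud = 1;  /* #UD, no aliasing */
                                return -1;
                        }
                        return dr + 2;          /* 4 -> 6, 5 -> 7 */
                }
                return dr;
        }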
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4295dfcc6031..a281368c3b96 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -554,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | @@ -2319,11 +2325,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, [SVM_EXIT_WRITE_DR0] = emulate_on_interception, [SVM_EXIT_WRITE_DR1] = emulate_on_interception, [SVM_EXIT_WRITE_DR2] = emulate_on_interception, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, -- cgit v1.2.2 From ab344828ebe729e52949d64046adaa196f6b9dbe Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 21 Jan 2010 15:28:46 +0200 Subject: KVM: x86: fix checking of cr0 validity The "Move to/from Control Registers" chapter of the Intel SDM says: "Reserved bits in CR0 remain clear after any load of those registers; attempts to set them have no impact". The Control Registers chapter says: "Bits 63:32 of CR0 are reserved and must be written with zeros. Writing a nonzero value to any of the upper 32 bits results in a general-protection exception, #GP(0)." This patch tries to implement this twisted logic. Signed-off-by: Gleb Natapov Reported-by: Lorenzo Martignoni Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fd5101b57fa3..ce267d9f0305 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -430,12 +430,16 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { cr0 |= X86_CR0_ET; - if (cr0 & CR0_RESERVED_BITS) { +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } +#endif + + cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); -- cgit v1.2.2 From 2608d7a12fd4badfe2eac534150f0715a7fd3ff3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:45 +0200 Subject: KVM: Allow kvm_load_guest_fpu() even when !vcpu->fpu_active This allows accessing the guest fpu from the instruction emulator, as well as being symmetric with kvm_put_guest_fpu().
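Returning briefly to the previous patch ("KVM: x86: fix checking of cr0 validity"): its twisted logic fits in a small standalone sketch. The low reserved mask below is illustrative only, not the kernel's real CR0_RESERVED_BITS value:

        #include <stdint.h>

        #define CR0_RESERVED_LOW 0x1ffaffc0UL   /* illustrative mask */

        /* Bits 63:32 fault with #GP(0); other reserved bits are
         * silently dropped so they "remain clear after any load". */
        static int cr0_write_ok(uint64_t val, uint64_t *effective)
        {
                if (val >> 32)
                        return 0;       /* #GP(0): upper half is reserved */
                *effective = val & ~(uint64_t)CR0_RESERVED_LOW;
                return 1;
        }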
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce267d9f0305..4cf4eac03bbc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4239,7 +4239,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); local_irq_disable(); @@ -5285,7 +5286,7 @@ EXPORT_SYMBOL_GPL(fx_init); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + if (vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; -- cgit v1.2.2 From e5bb40251a920cdd9d12c569c6aab0bdd0279e4e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:46 +0200 Subject: KVM: Drop kvm_{load,put}_guest_fpu() exports Not used anymore. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cf4eac03bbc..c61ec9c69267 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5293,7 +5293,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { @@ -5306,7 +5305,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { -- cgit v1.2.2 From 6b52d18605f580bdffaffd48c8da228c3e848deb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:47 +0200 Subject: KVM: Activate fpu on clts Assume that if the guest executes clts, it knows what it's doing, and load the guest fpu to prevent an #NM exception. 
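A schematic model of the lazy-FPU protocol behind this, assuming the usual TS-based trap-and-load scheme (simplified standalone C, not the kvm implementation):

        struct lazy_fpu {
                int ts_shadow;          /* CR0.TS as the guest sees it */
                int intercept_nm;       /* #NM exits enabled */
                int active;             /* guest FPU state loaded */
        };

        static void fpu_deactivate(struct lazy_fpu *f)
        {
                f->ts_shadow = 1;       /* next FPU insn faults with #NM */
                f->intercept_nm = 1;
                f->active = 0;
        }

        static void fpu_activate(struct lazy_fpu *f)
        {
                f->intercept_nm = 0;
                f->ts_shadow = 0;
                f->active = 1;          /* guest FPU state loaded here */
        }

        /* On CLTS emulation the guest has cleared TS itself, so a later
         * surprise #NM would be wrong; hence the explicit activate. */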
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 8 +++++++- arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d73ed48587e4..7ebf9fe670cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -511,6 +511,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a281368c3b96..800208a60a51 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1259,12 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } -static int nm_interception(struct vcpu_svm *svm) +static void svm_fpu_activate(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); svm->vcpu.fpu_active = 1; update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); return 1; } @@ -2977,6 +2982,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b7e812e9c299..fad871cbed19 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3002,6 +3002,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c61ec9c69267..4db0c8a9082e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3269,6 +3269,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); return X86EMUL_CONTINUE; } -- cgit v1.2.2 From 3eeb3288bcbf64da90afc26389b8844df7c34912 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:48 +0200 Subject: KVM: Add a helper for checking if the guest is in protected mode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 9 ++++----- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 7 +++---- arch/x86/kvm/x86.h | 6 ++++++ 4 files changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 48c7f9f8a08f..6a429eefc533 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -32,6 +32,7 @@ #include #include +#include "x86.h" #include "mmu.h" /* for is_long_mode() */ /* @@ -1515,7 +1516,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) /* syscall is not available in real mode */ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) + || !is_protmode(ctxt->vcpu)) return -1; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1568,8 +1569,7 @@ emulate_sysenter(struct x86_emulate_ctxt 
*ctxt) return -1; /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) { + if (ctxt->mode == X86EMUL_MODE_REAL || !is_protmode(ctxt->vcpu)) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } @@ -1634,8 +1634,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) return -1; /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) { + if (ctxt->mode == X86EMUL_MODE_REAL || !is_protmode(ctxt->vcpu)) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fad871cbed19..2e894954069f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1845,7 +1845,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) /* if real mode */ + if (!is_protmode(vcpu)) return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2100,7 +2100,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4db0c8a9082e..a4a7d1892f72 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3786,8 +3786,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || - !kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } @@ -4751,7 +4750,7 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct kvm_segment kvm_seg; - if (is_vm86_segment(vcpu, seg) || !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; @@ -5103,7 +5102,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) + !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea585d2a..f783d8fe0d1d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include +#include "kvm_cache_regs.h" static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { @@ -35,4 +36,9 @@ static inline bool kvm_exception_is_soft(unsigned int nr) struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + #endif -- cgit v1.2.2 From 836a1b3c3456042704c86aaa3d837b976de9343b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:49 +0200 Subject: KVM: Move cr0/cr4/efer related helpers to x86.h They have more general scope than the mmu. 
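These predicates read naturally at call sites; a hypothetical guard combining them, only to show how the helpers compose (emulate_realmode() and walk_pae_tables() are invented names):

        if (!is_protmode(vcpu))                 /* CR0.PE clear: real mode */
                return emulate_realmode(vcpu);
        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu))
                walk_pae_tables(vcpu);          /* 32-bit PAE paging */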
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 1 - arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/mmu.h | 24 ------------------------ arch/x86/kvm/x86.h | 24 ++++++++++++++++++++++++ 4 files changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6a429eefc533..645b245a3c23 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -33,7 +33,6 @@ #include #include "x86.h" -#include "mmu.h" /* for is_long_mode() */ /* * Opcode effective-address decode tables. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff2b2e8d72eb..6f7158f4fbfd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "x86.h" #include "kvm_cache_regs.h" #include diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 599159f728b9..61ef5a65b7d8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -58,30 +58,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, X86_CR0_PG); -} - static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f783d8fe0d1d..2dc24a755b6d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -41,4 +41,28 @@ static inline bool is_protmode(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_PE); } +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.shadow_efer & EFER_LMA; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} + #endif -- cgit v1.2.2 From f6801dff23bd1902473902194667f4ac1eb6ea26 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:50 +0200 Subject: KVM: Rename vcpu->shadow_efer to efer None of the other registers have the shadow_ prefix. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 12 ++++++------ arch/x86/kvm/vmx.c | 14 +++++++------- arch/x86/kvm/x86.c | 14 +++++++------- arch/x86/kvm/x86.h | 2 +- 6 files changed, 23 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7ebf9fe670cd..152233723844 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -277,7 +277,7 @@ struct kvm_vcpu_arch { unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ - u64 shadow_efer; + u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ int32_t apic_arb_prio; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6f7158f4fbfd..599c422c390f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -237,7 +237,7 @@ static int is_cpuid_PSE36(void) static int is_nx(struct kvm_vcpu *vcpu) { - return vcpu->arch.shadow_efer & EFER_NX; + return vcpu->arch.efer & EFER_NX; } static int is_shadow_present_pte(u64 pte) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 800208a60a51..9596cc86d6dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; } static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -996,14 +996,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } @@ -1361,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + if (!(svm->vcpu.arch.efer & EFER_SVME) || !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1764,7 +1764,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.ds = vmcb->save.ds; hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.shadow_efer; + hsave->save.efer = svm->vcpu.arch.efer; hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2e894954069f..a680d939546f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -613,7 +613,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer; u64 ignore_bits; - guest_efer = vmx->vcpu.arch.shadow_efer; + guest_efer = vmx->vcpu.arch.efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -955,7 +955,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) * if efer.sce is enabled. 
*/ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -1600,7 +1600,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) * of this msr depends on is_long_mode(). */ vmx_load_host_state(to_vmx(vcpu)); - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; if (!msr) return; if (efer & EFER_LMA) { @@ -1632,13 +1632,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - vmx_set_efer(vcpu, vcpu->arch.shadow_efer); + vcpu->arch.efer |= EFER_LMA; + vmx_set_efer(vcpu, vcpu->arch.efer); } static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1745,7 +1745,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a4a7d1892f72..27af6e353b06 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -456,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { @@ -655,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) } if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); kvm_inject_gp(vcpu, 0); return; @@ -686,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); @@ -1426,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.shadow_efer; + data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: data = vcpu->kvm->arch.wall_clock; @@ -4569,7 +4569,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->cr3 = vcpu->arch.cr3; sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); @@ -5059,7 +5059,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2dc24a755b6d..2d101639bd8d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -44,7 +44,7 @@ static inline bool is_protmode(struct kvm_vcpu *vcpu) static inline int is_long_mode(struct kvm_vcpu *vcpu) { 
#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; + return vcpu->arch.efer & EFER_LMA; #else return 0; #endif -- cgit v1.2.2 From 8ae099127668de80e4babba73bb492740ce3a1be Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:51 +0200 Subject: KVM: Optimize kvm_read_cr[04]_bits() 'mask' is always a constant, so we can check whether it includes a bit that might be owned by the guest very cheaply, and avoid the decache call. Saves a few hundred bytes of module text. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/kvm_cache_regs.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6b419a36cbd9..cff851cf5322 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,6 +1,11 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -40,7 +45,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { - if (mask & vcpu->arch.cr0_guest_owned_bits) + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + if (tmask & vcpu->arch.cr0_guest_owned_bits) kvm_x86_ops->decache_cr0_guest_bits(vcpu); return vcpu->arch.cr0 & mask; } @@ -52,7 +58,8 @@ static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { - if (mask & vcpu->arch.cr4_guest_owned_bits) + ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + if (tmask & vcpu->arch.cr4_guest_owned_bits) kvm_x86_ops->decache_cr4_guest_bits(vcpu); return vcpu->arch.cr4 & mask; } -- cgit v1.2.2 From 0c04851c0c093ce98ab4ca69556480d779292418 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:52 +0200 Subject: KVM: trace guest fpu loads and unloads Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27af6e353b06..3b90298fb980 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5292,6 +5292,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 1; kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); + trace_kvm_fpu(1); } void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) @@ -5304,6 +5305,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + trace_kvm_fpu(0); } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From d7fa6ab217aeed26293e01d7b64f79a1ac57e823 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Jan 2010 16:55:05 +0800 Subject: KVM: MMU: Remove some useless code from alloc_mmu_pages() If we fail to allocate the page for vcpu->arch.mmu.pae_root, the call to free_mmu_pages() is unnecessary: all it would do is free the very page whose allocation just failed.
Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 599c422c390f..dc4d954efacd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2843,16 +2843,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) */ page = alloc_page(GFP_KERNEL | __GFP_DMA32); if (!page) - goto error_1; + return -ENOMEM; + vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; } int kvm_mmu_create(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 81231c698a71af6e1815df72c06685d295e1cc1d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 Jan 2010 16:26:40 +0200 Subject: KVM: VMX: Pass cr0.mp through to the guest when the fpu is active When cr0.mp is clear, the guest doesn't expect a #NM in response to a WAIT instruction. Because we always keep cr0.mp set, it will get a #NM, and potentially be confused. Fix by keeping cr0.mp set only when the fpu is inactive, and passing it through when active. Reported-by: Lorenzo Martignoni Analyzed-by: Gleb Natapov Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a680d939546f..7a56879a058c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -66,7 +66,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); #define KVM_GUEST_CR0_MASK \ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_MP) + (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ @@ -791,12 +791,15 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static void vmx_fpu_activate(struct kvm_vcpu *vcpu) { + ulong cr0; + if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (kvm_read_cr0_bits(vcpu, X86_CR0_TS)) - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + cr0 = vmcs_readl(GUEST_CR0); + cr0 &= ~(X86_CR0_TS | X86_CR0_MP); + cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); + vmcs_writel(GUEST_CR0, cr0); update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); @@ -807,7 +810,7 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { vmx_decache_cr0_guest_bits(vcpu); - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = 0; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); @@ -1757,7 +1760,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); if (!vcpu->fpu_active) - hw_cr0 |= X86_CR0_TS; + hw_cr0 |= X86_CR0_TS | X86_CR0_MP; vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); -- cgit v1.2.2 From e01c2426149d70dc6dd46ad0453195656b6eeaa4 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 25 Jan 2010 12:01:04 +0200 Subject: KVM: mark segments accessed on HW task switch On HW task switch newly loaded segments should be marked as accessed.
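For reference, bit 0 of the 4-bit type field in a code/data segment descriptor (S=1) is the accessed bit the patch below sets on write-back. A sketch of the field layout, with illustrative macro names rather than the kernel's:

        #define SEG_TYPE_ACCESSED  (1 << 0)  /* set by the CPU on load */
        #define SEG_TYPE_RW        (1 << 1)  /* data: writable; code: readable */
        #define SEG_TYPE_EC        (1 << 2)  /* data: expand-down; code: conforming */
        #define SEG_TYPE_CODE      (1 << 3)  /* 1 = code, 0 = data */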
Reported-by: Lorenzo Martignoni Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b90298fb980..d47ceda7a928 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4697,18 +4697,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) return kvm_seg.selector; } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { @@ -4749,11 +4737,14 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; + struct desc_struct seg_desc; if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); + if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) return 1; + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); kvm_check_segment_descriptor(vcpu, seg, selector); kvm_seg.type |= type_bits; @@ -4764,6 +4755,11 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_seg.unusable = 1; kvm_set_segment(vcpu, &kvm_seg, seg); + if (selector && !kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + seg_desc.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } return 0; } -- cgit v1.2.2 From 6e7d15296773f7a479072ec7f3248f51d01e1fad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 25 Jan 2010 19:36:03 +0200 Subject: KVM: Fix msr trace - data is 64 bits wide, not unsigned long - rw is confusingly named Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/trace.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 1cb3d0e990f3..45903a926372 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -246,23 +246,23 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), - TP_ARGS(rw, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data), + TP_ARGS(write, ecx, data), TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, ecx ) - __field( unsigned long, data ) + __field( unsigned, write ) + __field( u32, ecx ) + __field( u64, data ) ), TP_fast_assign( - __entry->rw = rw; + __entry->write = write; __entry->ecx = ecx; __entry->data = data; ), - TP_printk("msr_%s %x = 0x%lx", - __entry->rw ? "write" : "read", + TP_printk("msr_%s %x = 0x%llx", + __entry->write ? "write" : "read", __entry->ecx, __entry->data) ); -- cgit v1.2.2 From 59200273c4d07ddf99dd3c8d91b90bb170457fc3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 25 Jan 2010 19:47:02 +0200 Subject: KVM: Trace failed msr reads and writes Record failed msr reads and writes, and the fact that they failed as well.
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 13 ++++++++----- arch/x86/kvm/trace.h | 17 +++++++++++------ arch/x86/kvm/vmx.c | 5 +++-- 3 files changed, 22 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9596cc86d6dd..52f78dd03010 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2177,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm) u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(&svm->vcpu, ecx, &data)) + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); - else { + } else { trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2271,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); - else + } else { + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); + } return 1; } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 45903a926372..6ad30a29f044 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -246,28 +246,33 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned write, u32 ecx, u64 data), - TP_ARGS(write, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), + TP_ARGS(write, ecx, data, exception), TP_STRUCT__entry( __field( unsigned, write ) __field( u32, ecx ) __field( u64, data ) + __field( u8, exception ) ), TP_fast_assign( __entry->write = write; __entry->ecx = ecx; __entry->data = data; + __entry->exception = exception; ), - TP_printk("msr_%s %x = 0x%llx", + TP_printk("msr_%s %x = 0x%llx%s", __entry->write ? "write" : "read", - __entry->ecx, __entry->data) + __entry->ecx, __entry->data, + __entry->exception ? " (#GP)" : "") ); -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) /* * Tracepoint for guest CR access. 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7a56879a058c..334b016267a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3158,6 +3158,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } @@ -3177,13 +3178,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } -- cgit v1.2.2 From 6d3e435e708fa32f99b780caf6c5508a606a513b Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Fri, 29 Jan 2010 15:36:59 +0800 Subject: KVM: VMX: Remove redundant check in vm_need_virtualize_apic_accesses() flexpriority_enabled implies cpu_has_vmx_virtualize_apic_accesses() returning true, so we don't need this check here. Signed-off-by: Gui Jianfeng Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 334b016267a9..f083b64814d8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -358,9 +358,7 @@ static inline int cpu_has_vmx_ple(void) static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return flexpriority_enabled && - (cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm)); + return flexpriority_enabled && irqchip_in_kernel(kvm); } static inline int cpu_has_vmx_vpid(void) -- cgit v1.2.2 From ab9f4ecbb6d39a18e300a0d10a4968c37404aa76 Mon Sep 17 00:00:00 2001 From: "Zhai, Edwin" Date: Fri, 29 Jan 2010 14:38:44 +0800 Subject: KVM: enable PCI multiple-segments for pass-through device Enable an optional parameter (default 0), the PCI segment (or domain), in addition to BDF when assigning a PCI device to a guest. Signed-off-by: Zhai Edwin Acked-by: Chris Wright Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d47ceda7a928..0bf3df527afc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1569,6 +1569,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_PCI_SEGMENT: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.2.2 From c125c607326928001983f51109c3d0c190f0a8bb Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 1 Feb 2010 22:11:04 +0900 Subject: KVM: fix load_guest_segment_descriptor() to return X86EMUL_* This patch fixes load_guest_segment_descriptor() to return X86EMUL_PROPAGATE_FAULT when it tries to access the descriptor table beyond its limit, as suggested by Marcelo. I have checked the current callers of this helper function, - kvm_load_segment_descriptor() - kvm_task_switch() and confirmed that this patch will change nothing in the upper layers if we do not change the handling of this return value from load_guest_segment_descriptor(). Next step: Although fixing kvm_task_switch() to handle the propagated faults properly seems difficult (and maybe not worth it, because the TSS is not commonly used these days), we can fix kvm_load_segment_descriptor(). By doing so, the injected #GP can be handled by the guest.
The only problem is how to differentiate this fault from the page faults generated by kvm_read_guest_virt(). We may have to split this function to achieve this goal. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0bf3df527afc..01f0b037092e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4663,7 +4663,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); - return 1; + return X86EMUL_PROPAGATE_FAULT; } return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } -- cgit v1.2.2 From 7edcface95c6e593faa40c70e0464500515db573 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 1 Feb 2010 22:11:52 +0900 Subject: KVM: fix kvm_fix_hypercall() to return X86EMUL_* This patch fixes kvm_fix_hypercall() to propagate X86EMUL_* info generated by emulator_write_emulated() to its callers, as suggested by Marcelo. The effect of this is that x86_emulate_insn() will begin to handle the page faults which occur in emulator_write_emulated(): this should be OK because emulator_write_emulated_onepage() always injects a page fault when emulator_write_emulated() returns X86EMUL_PROPAGATE_FAULT. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01f0b037092e..c91007f81660 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3888,10 +3888,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; - int ret = 0; unsigned long rip = kvm_rip_read(vcpu); - /* * Blow out the MMU to ensure that no other VCPU has an active mapping * to ensure that the updated hypercall appears atomically across all @@ -3900,11 +3898,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_mmu_zap_all(vcpu->kvm); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; - return ret; + return emulator_write_emulated(rip, instruction, 3, vcpu); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) -- cgit v1.2.2 From ebcbab4c034db2ec25abe702d788936b29a49b24 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 7 Feb 2010 11:56:52 +0200 Subject: KVM: VMX: Wire up .fpu_activate() callback Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f083b64814d8..82fb810afd3f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4137,6 +4137,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, -- cgit v1.2.2 From c45b4fd416f5497b6b38dd70acc0e5b01399e5c9 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 6 Feb 2010 09:43:03 +0100 Subject: KVM: VMX: Remove redundant test in vmx_set_efer() msr was tested above, so the second test is not needed.
A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r@ expression *x; expression e; identifier l; @@ if (x == NULL || ...) { ... when forall return ...; } ... when != goto l; when != x = e when != &x *x == NULL // Signed-off-by: Julia Lawall Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 82fb810afd3f..b400be06c8cd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1602,8 +1602,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) */ vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.efer = efer; - if (!msr) - return; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | -- cgit v1.2.2 From 8f0b1ab6fb045a1324d9435ba00c2940783b0041 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 28 Jan 2010 12:37:56 +0100 Subject: KVM: Introduce kvm_host_page_size This patch introduces a generic function to find out the host page size for a given gfn. This function is needed by the kvm iommu code. This patch also simplifies the x86 host_mapping_level function. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dc4d954efacd..913ef4b7939a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -468,24 +468,10 @@ static int has_wrprotected_page(struct kvm *kvm, static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { - unsigned long page_size = PAGE_SIZE; - struct vm_area_struct *vma; - unsigned long addr; + unsigned long page_size; int i, ret = 0; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return PT_PAGE_TABLE_LEVEL; - - down_read(&current->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - page_size = vma_kernel_pagesize(vma); - -out: - up_read(&current->mm->mmap_sem); + page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { -- cgit v1.2.2 From 6316e1c8c6af6ccb55ff8564231710660608f46c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Feb 2010 16:11:03 -0500 Subject: KVM: VMX: emulate accessed bit for EPT Currently KVM pretends that pages with EPT mappings never got accessed. This has some side effects in the VM, like swapping out actively used guest pages and needlessly breaking up actively used hugepages. We can avoid those very costly side effects by emulating the accessed bit for EPT PTEs, which should only be slightly costly because pages pass through page_referenced infrequently. TLB flushing is taken care of by kvm_mmu_notifier_clear_flush_young(). This seems to help prevent KVM guests from being swapped out when they should not be, on my system. Signed-off-by: Rik van Riel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 913ef4b7939a..b8da6715d08b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -838,9 +838,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int young = 0; - /* always return old for EPT */ + /* + * Emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established.
+ * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ if (!shadow_accessed_mask) - return 0; + return kvm_unmap_rmapp(kvm, rmapp, data); spte = rmap_next(kvm, rmapp, NULL); while (spte) { -- cgit v1.2.2 From 1976d2d2c91246a37fcb8246b811de735aa6e9a4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 5 Feb 2010 17:52:46 +0900 Subject: KVM: Remove redundant reading of rax on OUT instructions kvm_emulate_pio() and complete_pio() both read out the RAX register value and copy it to a place into which the value read out from the port will be copied later. This patch removes this redundancy. /*** snippet from arch/x86/kvm/x86.c ***/ int complete_pio(struct kvm_vcpu *vcpu) { ... if (!io->string) { if (io->in) { val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(&val, vcpu->arch.pio_data, io->size); kvm_register_write(vcpu, VCPU_REGS_RAX, val); } ... Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c91007f81660..bd3161c6daed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3551,8 +3551,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, size, 1); - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); -- cgit v1.2.2 From 90bb6fc556ab255abd798bcf4ff5769690ab2eea Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 Dec 2009 12:10:16 +0200 Subject: KVM: MMU: Add tracepoint for guest page aging Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8da6715d08b..739793240d1d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -151,6 +151,9 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#include + +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -792,6 +795,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data)) { int i, j; + int ret; int retval = 0; struct kvm_memslots *slots; @@ -806,16 +810,17 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset], - data); + ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); - retval |= handler(kvm, + ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); } + trace_kvm_age_page(hva, memslot, ret); + retval |= ret; } } -- cgit v1.2.2 From a19a6d1131c77822ae55f84a1bd6d62327e99287 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 9 Feb 2010 16:41:53 +0800 Subject: KVM: VMX: Rename VMX_EPT_IGMT_BIT to VMX_EPT_IPAT_BIT Following the new SDM. Now the bit is named "Ignore PAT memory type". 
Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/vmx.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 43f1e9b45917..fb9a080740ec 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -377,7 +377,7 @@ enum vmcs_field { #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull -#define VMX_EPT_IGMT_BIT (1ull << 6) +#define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b400be06c8cd..f82b0723afa5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4001,7 +4001,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ if (is_mmio) @@ -4012,7 +4012,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) VMX_EPT_MT_EPTE_SHIFT; else ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IGMT_BIT; + | VMX_EPT_IPAT_BIT; return ret; } -- cgit v1.2.2 From d225f53b76f4287365efd381b9e3a732e6bad27e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Feb 2010 17:03:51 +0800 Subject: KVM: PIT: unregister kvm irq notifier if fail to create pit If we fail to create the PIT, we should unregister the kvm irq notifiers that were registered in kvm_create_pit(). Signed-off-by: Wei Yongjun Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 6a74246f80c6..c9569f20dfba 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -663,8 +663,9 @@ fail_unregister: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: - if (pit->irq_source_id >= 0) - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; -- cgit v1.2.2 From 72bb2fcd23afe8db53b47e8f9edd736c517ba532 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 9 Feb 2010 10:33:03 +0800 Subject: KVM: cleanup the failure path of KVM_CREATE_IRQCHIP ioctl If we fail to init the ioapic device, or fail to set up the default irq routing, the devices registered by kvm_create_pic() and kvm_ioapic_init() are never unregistered. This patch fixes the failure path to unregister them.
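Both of these fixes follow the same shape: teardown must mirror setup in reverse order, and nothing half-constructed may stay registered. A generic sketch of the pattern, with hypothetical setup_*/teardown_* names:

        int create_thing(struct kvm *kvm)
        {
                int r;

                r = setup_a(kvm);
                if (r)
                        return r;
                r = setup_b(kvm);
                if (r)
                        goto undo_a;
                r = setup_c(kvm);
                if (r)
                        goto undo_b;
                return 0;

        undo_b:
                teardown_b(kvm);        /* unwind in reverse order */
        undo_a:
                teardown_a(kvm);
                return r;
        }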
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 11 +++++++++++ arch/x86/kvm/irq.h | 1 + arch/x86/kvm/x86.c | 8 ++++---- 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d5753a75d58c..a3711f9e580f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -543,3 +543,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; } + +void kvm_destroy_pic(struct kvm *kvm) +{ + struct kvm_pic *vpic = kvm->arch.vpic; + + if (vpic) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm->arch.vpic = NULL; + kfree(vpic); + } +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index be399e207d57..0b71d480ebf1 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -75,6 +75,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd3161c6daed..b2f91b9af00d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2771,6 +2771,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev); kfree(vpic); goto create_irqchip_unlock; } @@ -2782,10 +2784,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->irq_lock); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm->arch.vpic = NULL; - kvm->arch.vioapic = NULL; + kvm_ioapic_destroy(kvm); + kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); } create_irqchip_unlock: -- cgit v1.2.2 From 2db2c2eb6226e30f8059b82512a1364db98da8e3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:29 +0200 Subject: KVM: x86 emulator: Add group8 instruction decoding Use groups mechanism to decode 0F BA instructions. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 645b245a3c23..435b1e4e8c9b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -88,6 +88,7 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, }; static u32 opcode_table[256] = { @@ -267,7 +268,7 @@ static u32 twobyte_table[256] = { 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, Group | Group8, DstMem | SrcReg | ModRM | BitOp, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ @@ -323,6 +324,10 @@ static u32 group_table[] = { 0, 0, ModRM | SrcMem, ModRM | SrcMem, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, }; static u32 group2_table[] = { -- cgit v1.2.2 From 60a29d4ea4e7b6b95d9391ebc8625b0426f3a363 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:30 +0200 Subject: KVM: x86 emulator: Add group9 instruction decoding Use groups mechanism to decode 0F C7 instructions. 
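Roughly, the emulator's group mechanism works like this: when an opcode_table entry carries the Group flag, the effective decode flags come from a side table indexed by the ModRM reg field (bits 5:3), and GroupDual selects an alternate table for the register form. A schematic restatement (not the exact emulate.c code; peek_modrm_byte() is an invented helper):

        group = flags & GroupMask;              /* e.g. Group9 for 0F C7 */
        modrm = peek_modrm_byte();
        index = group * 8 + ((modrm >> 3) & 7); /* reg field picks the slot */
        if ((flags & GroupDual) && (modrm >> 6) == 3)
                flags = group2_table[index];    /* mod == 11b: register form */
        else
                flags = group_table[index];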
Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 435b1e4e8c9b..45a4f7c1bb0b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -88,7 +88,7 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, - Group8, + Group8, Group9, }; static u32 opcode_table[256] = { @@ -272,7 +272,8 @@ static u32 twobyte_table[256] = { 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -328,6 +329,8 @@ static u32 group_table[] = { 0, 0, 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + [Group9*8] = + 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { @@ -335,6 +338,8 @@ static u32 group2_table[] = { SrcNone | ModRM, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ -- cgit v1.2.2 From a0044755679f3e761b8b95995e5f2db2b7efd0f6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:31 +0200 Subject: KVM: x86 emulator: Add Virtual-8086 mode of emulation For some instructions CPU behaves differently for real-mode and virtual 8086. Let emulator know which mode cpu is in, so it will not poke into vcpu state directly. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 12 +++++++----- arch/x86/kvm/x86.c | 3 ++- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 9b697c2735d9..784d7c586d8e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -168,6 +168,7 @@ struct x86_emulate_ctxt { /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. 
*/ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 45a4f7c1bb0b..e4e2df3b6038 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -899,6 +899,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -1525,7 +1526,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) /* syscall is not available in real mode */ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !is_protmode(ctxt->vcpu)) + || ctxt->mode == X86EMUL_MODE_VM86) return -1; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1577,8 +1578,8 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) if (c->lock_prefix) return -1; - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || !is_protmode(ctxt->vcpu)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } @@ -1642,8 +1643,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) if (c->lock_prefix) return -1; - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || !is_protmode(ctxt->vcpu)) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b2f91b9af00d..a28379507d30 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3348,8 +3348,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; -- cgit v1.2.2 From 1871c6020d7308afb99127bba51f04548e7ca84e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:32 +0200 Subject: KVM: x86 emulator: fix memory access during x86 emulation Currently when x86 emulator needs to access memory, page walk is done with broadest permission possible, so if emulated instruction was executed by userspace process it can still access kernel memory. Fix that by providing correct memory access to page walker during emulation. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 14 +++- arch/x86/include/asm/kvm_host.h | 7 +- arch/x86/kvm/emulate.c | 6 +- arch/x86/kvm/mmu.c | 17 ++--- arch/x86/kvm/mmu.h | 6 ++ arch/x86/kvm/paging_tmpl.h | 11 +++- arch/x86/kvm/x86.c | 131 ++++++++++++++++++++++++++++--------- 7 files changed, 142 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 784d7c586d8e..7a6f54fa13ba 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -54,13 +54,23 @@ struct x86_emulate_ctxt; struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. + * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. 
*/ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 152233723844..c07c16f64015 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -243,7 +243,8 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -660,6 +661,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e4e2df3b6038..c44b46014842 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -616,7 +616,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, if (linear < fc->start || linear >= fc->end) { size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); if (rc) return rc; fc->start = linear; @@ -671,11 +671,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); + ctxt->vcpu, NULL); if (rc) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 739793240d1d..741373e8ca77 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -138,12 +138,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 @@ -1632,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); if (gpa == UNMAPPED_GVA) return NULL; @@ -2155,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t 
nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2740,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -3237,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61ef5a65b7d8..be66759321a5 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -42,6 +42,12 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index df15a5307d2d..81eab9a50e6a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a28379507d30..ea3a8af8a478 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3039,14 +3039,41 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3069,14 +3096,37 @@ out: return r; } +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3106,6 +3156,7 @@ static int emulator_read_emulated(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3115,17 +3166,20 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -3164,11 +3218,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -3232,7 +3287,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, char *kaddr; u64 val; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3297,7 +3352,7 @@ void 
kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -3305,7 +3360,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -3442,12 +3498,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu) gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; int ret; + u32 error_code; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + return ret; } @@ -3468,7 +3529,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (io->in) { r = pio_copy_data(vcpu); if (r) - return r; + goto out; } delta = 1; @@ -3495,7 +3556,7 @@ int complete_pio(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RSI, val); } } - +out: io->count -= io->cur_count; io->cur_count = 0; @@ -3617,10 +3678,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); + if (ret == X86EMUL_PROPAGATE_FAULT) return 1; - } if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -4663,7 +4722,9 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_read_guest_virt_system(dtable.base + index*8, + seg_desc, sizeof(*seg_desc), + vcpu, NULL); } /* allowed just for 8 bytes segments */ @@ -4677,15 +4738,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc) +{ + u32 base_addr = get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); } -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { u32 base_addr = get_desc_base(seg_desc); - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ -4894,7 +4963,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_16)) goto out; - if (kvm_read_guest(vcpu->kvm, 
get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_16, sizeof tss_segment_16)) goto out; @@ -4902,7 +4971,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_16.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_16.prev_task_link, sizeof tss_segment_16.prev_task_link)) goto out; @@ -4933,7 +5002,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_32)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_32, sizeof tss_segment_32)) goto out; @@ -4941,7 +5010,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_32.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_32.prev_task_link, sizeof tss_segment_32.prev_task_link)) goto out; @@ -4964,7 +5033,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); /* FIXME: Handle errors. Failure to read either TSS or their * descriptors should generate a pagefault. @@ -5199,7 +5268,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; -- cgit v1.2.2 From f850e2e603bf5a05b0aee7901857cf85715aa694 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:33 +0200 Subject: KVM: x86 emulator: Check IOPL level during io instruction emulation Make emulator check that vcpu is allowed to execute IN, INS, OUT, OUTS, CLI, STI. 
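The core of the check is the CPL-versus-IOPL comparison on EFLAGS bits 13:12; real mode performs no check, and virtual-8086 mode always falls back to the TSS I/O permission bitmap. A compilable sketch of that predicate, using simplified stand-in types rather than the emulator context:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_EFLAGS_IOPL	(3u << 12)	/* EFLAGS bits 13:12 */
#define IOPL_SHIFT	12

enum cpu_mode { MODE_REAL, MODE_VM86, MODE_PROT };

/*
 * True when the I/O permission bitmap in the TSS must allow the
 * port before the access may proceed.
 */
static bool bad_iopl(enum cpu_mode mode, uint32_t eflags, unsigned int cpl)
{
	if (mode == MODE_REAL)
		return false;	/* real mode: no I/O protection */
	if (mode == MODE_VM86)
		return true;	/* vm86: always defer to the bitmap */
	return cpl > ((eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT);
}

int main(void)
{
	/* CPL 3 task with IOPL 0: the bitmap decides */
	printf("bad_iopl=%d\n", bad_iopl(MODE_PROT, 0, 3));
	return 0;
}

When bad_iopl() is true the emulator still consults the I/O bitmap in the TSS before raising #GP(0), which is what emulator_io_port_access_allowed() implements in the patch.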
Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/emulate.c | 89 +++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/x86.c | 10 ++--- 3 files changed, 87 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c07c16f64015..f9a2f66530cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -678,6 +678,7 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c44b46014842..296e8519dc53 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1698,6 +1698,57 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) return 0; } +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1889,7 +1940,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1905,6 +1961,11 @@ special_insn: return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } if (kvm_emulate_pio_string(ctxt->vcpu, 0, (c->d & ByteOp) ? 1 : c->op_bytes, @@ -2202,7 +2263,13 @@ special_insn: case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, (c->d & ByteOp) ? 
1 : c->op_bytes, port) != 0) { c->eip = saved_eip; @@ -2227,13 +2294,21 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea3a8af8a478..86b739f8f173 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3599,6 +3599,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { unsigned long val; + trace_kvm_pio(!in, port, size, 1); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3610,9 +3612,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); - if (!vcpu->arch.pio.in) { val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); @@ -3633,6 +3632,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, unsigned now, in_page; int ret = 0; + trace_kvm_pio(!in, port, size, count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3644,9 +3645,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; -- cgit v1.2.2 From d4c6a1549c056f1d817e8f6f2f97d8b44933472f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:34 +0200 Subject: KVM: x86 emulator: Fix popf emulation POPF behaves differently depending on current CPU mode. Emulate correct logic to prevent guest from changing flags that it can't change otherwise. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 296e8519dc53..1782387c069e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -343,11 +343,18 @@ static u32 group2_table[] = { }; /* EFLAGS bit definitions. 
*/ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -1214,6 +1221,49 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, return rc; } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; @@ -2099,7 +2149,10 @@ special_insn: c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; -- cgit v1.2.2 From e92805ac1228626c59c865f2f4e9059b9fb8c97b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:35 +0200 Subject: KVM: x86 emulator: Check CPL level during privilege instruction emulation Add CPL checking in case emulator is tricked into emulating privilege instruction from userspace. 
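Rather than open-coding a CPL test in each handler, the patch tags privileged entries in the decode tables with a Priv bit and performs a single gate before execution. A minimal compilable model of that gate follows; inject_gp() is a placeholder, not the KVM helper.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PRIV (1u << 27)	/* same bit position the patch assigns */

static void inject_gp(void)
{
	puts("#GP(0)");
}

/* One table-driven test replaces per-instruction CPL checks. */
static bool may_execute(uint32_t decode_flags, unsigned int cpl)
{
	if ((decode_flags & PRIV) && cpl != 0) {
		inject_gp();
		return false;
	}
	return true;
}

int main(void)
{
	may_execute(PRIV, 3);	/* e.g. emulated LMSW attempted from CPL 3 */
	return 0;
}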
Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1782387c069e..d63211169ac1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -76,6 +76,7 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) @@ -211,7 +212,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -219,16 +220,20 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -322,9 +327,9 @@ static u32 group_table[] = { SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, [Group8*8] = 0, 0, 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, @@ -335,7 +340,7 @@ static u32 group_table[] = { static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, [Group9*8] = @@ -1700,12 +1705,6 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) return -1; } - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - setup_syscalls_segments(ctxt, &cs, &ss); if ((c->rex_prefix & 0x8) != 0x0) @@ -1820,6 +1819,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) memop = c->modrm_ea; -- cgit v1.2.2 From 
d380a5e4022a5c63a6c5ac631e48c752ba7f1e4f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:36 +0200 Subject: KVM: x86 emulator: Add LOCK prefix validity checking Instructions which are not allowed to have LOCK prefix should generate #UD if one is used. [avi: fold opcode 82 fix from another patch] Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 97 +++++++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d63211169ac1..c2de9f03da84 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -76,6 +76,7 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ @@ -94,35 +95,35 @@ enum { static u32 opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x38 - 0x3F */ @@ -158,7 +159,7 @@ static u32 opcode_table[256] = { Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -263,17 +264,18 
@@ static u32 twobyte_table[256] = { DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, Group | Group8, DstMem | SrcReg | ModRM | BitOp, + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ @@ -290,25 +292,41 @@ static u32 twobyte_table[256] = { static u32 group_table[] = { [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = @@ -332,10 +350,10 @@ static u32 group_table[] = { SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, [Group8*8] = 0, 0, 0, 0, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | 
SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, [Group9*8] = - 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, + 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { @@ -1580,8 +1598,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) u64 msr_data; /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || ctxt->mode == X86EMUL_MODE_VM86) + if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) return -1; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1629,10 +1646,6 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) struct kvm_segment cs, ss; u64 msr_data; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); @@ -1694,10 +1707,6 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) u64 msr_data; int usermode; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { @@ -1819,6 +1828,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && !(c->d & Lock)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { kvm_inject_gp(ctxt->vcpu, 0); -- cgit v1.2.2 From 1d327eac3c9a8b170d68df44957d82bdbd263c53 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 11 Feb 2010 11:12:07 +0800 Subject: KVM: x86 emulator: code style cleanup Just remove redundant semicolon. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c2de9f03da84..dd1b93522379 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1016,7 +1016,7 @@ done_prefixes: } if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; + kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction"); return -1; } -- cgit v1.2.2 From e424e1918345ab17d3beb068b8745ab1706d0e1b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Feb 2010 12:41:10 +0200 Subject: KVM: x86 emulator: disallow opcode 82 in 64-bit mode Instructions with opcode 82 are not valid in 64 bit mode. 
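Tagging the Group1_82 slots with No64 reuses the decoder's existing long-mode gate rather than adding a new check. A small sketch of that gate, assuming the same bit layout the table uses:

#include <stdint.h>

#define NO64 (1u << 28)	/* same bit the decoder already defines */

/* Sketch of the existing mode gate that the No64 tag feeds. */
static int check_no64(uint32_t decode_flags, int long_mode)
{
	if (long_mode && (decode_flags & NO64))
		return -1;	/* invalid x86/64 instruction */
	return 0;
}

int main(void)
{
	/* opcode 82 encodings now carry No64 and are rejected in long mode */
	return check_no64(NO64, 1) == -1 ? 0 : 1;
}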
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index dd1b93522379..c280c2352f7e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -310,14 +310,14 @@ static u32 group_table[] = { DstMem | SrcImm | ModRM | Lock, DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, [Group1_83*8] = DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, -- cgit v1.2.2 From fa8273e9549b3cee2947d1aebe5242e7a25eb3de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Feb 2010 14:00:41 +0000 Subject: KVM: Convert i8254/i8259 locks to raw_spinlocks The i8254/i8259 locks need to be real spinlocks on preempt-rt. Convert them to raw_spinlock. No change for !RT kernels. Signed-off-by: Thomas Gleixner Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 10 +++++----- arch/x86/kvm/i8254.h | 2 +- arch/x86/kvm/i8259.c | 31 ++++++++++++++++--------------- arch/x86/kvm/irq.h | 2 +- arch/x86/kvm/x86.c | 8 ++++---- 5 files changed, 27 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c9569f20dfba..294698b6daff 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -724,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. 
*/ - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7ffdc09..900d6b0ba7c2 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index a3711f9e580f..07771da85de5 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ - spin_unlock(&s->pics_state->lock); + raw_spin_unlock(&s->pics_state->lock); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - spin_lock(&s->pics_state->lock); + raw_spin_lock(&s->pics_state->lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + + raw_spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } /* @@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + raw_spin_lock(&s->lock); pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + raw_spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return ret; } @@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + raw_spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return intno; } @@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = 
kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 0b71d480ebf1..34b15915754d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + raw_spinlock_t lock; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 86b739f8f173..f6ae4875d9d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2542,18 +2542,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); -- cgit v1.2.2 From 8b9f44140bc4afd2698413cd9960c3912168ee91 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 18 Feb 2010 12:14:59 +0200 Subject: KVM: x86 emulator: Forbid modifying CS segment register by mov instruction Inject #UD if guest attempts to do so. This is in accordance to Intel SDM. Cc: stable@kernel.org (2.6.33, 2.6.32) Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c280c2352f7e..2db760ff887c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2126,6 +2126,12 @@ special_insn: int err; sel = c->src.val; + + if (c->modrm_reg == VCPU_SREG_CS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + if (c->modrm_reg == VCPU_SREG_SS) toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); -- cgit v1.2.2 From 6f550484a15ea1b468665cdf59f020bf08ccb292 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 18 Feb 2010 12:15:00 +0200 Subject: KVM: Fix load_guest_segment_descriptor() to inject page fault This patch injects page fault when reading descriptor in load_guest_segment_descriptor() fails with FAULT. Effects of this injection: This function is used by kvm_load_segment_descriptor() which is necessary for the following instructions: - mov seg,r/m16 - jmp far - pop ?s This patch makes it possible to emulate the page faults generated by these instructions. But be sure that unless we change the kvm_load_segment_descriptor()'s ret value propagation this patch has no effect. 
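The convention this builds on is that guest-memory readers return X86EMUL_PROPAGATE_FAULT together with a hardware-style #PF error code, leaving the caller to inject the fault. A compilable sketch of that contract; the constant values and the read_virt() helper are illustrative, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

#define X86EMUL_CONTINUE	0
#define X86EMUL_PROPAGATE_FAULT	2	/* values illustrative only */

/* Hypothetical reader: fills *err with a #PF error code on failure. */
static int read_virt(uint64_t addr, void *val, unsigned int bytes,
		     uint32_t *err)
{
	(void)addr; (void)val; (void)bytes;
	*err = 0x04;	/* e.g. user-mode read of a not-present page */
	return X86EMUL_PROPAGATE_FAULT;
}

static int load_descriptor_sketch(uint64_t addr)
{
	uint64_t desc;
	uint32_t err;
	int ret = read_virt(addr, &desc, sizeof(desc), &err);

	if (ret == X86EMUL_PROPAGATE_FAULT)
		printf("inject #PF at %#llx, error code %#x\n",
		       (unsigned long long)addr, err);
	return ret;
}

int main(void)
{
	return load_descriptor_sketch(0x1000) == X86EMUL_CONTINUE ? 0 : 1;
}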
Signed-off-by: Takuya Yoshikawa Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f6ae4875d9d0..203ee7d0ed58 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4713,6 +4713,9 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct descriptor_table dtable; u16 index = selector >> 3; + int ret; + u32 err; + gva_t addr; get_segment_descriptor_dtable(vcpu, selector, &dtable); @@ -4720,9 +4723,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } - return kvm_read_guest_virt_system(dtable.base + index*8, - seg_desc, sizeof(*seg_desc), - vcpu, NULL); + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; } /* allowed just for 8 bytes segments */ -- cgit v1.2.2 From c697518a861e6c43b92b848895f9926580ee63c3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 18 Feb 2010 12:15:01 +0200 Subject: KVM: Fix segment descriptor loading Add proper error and permission checking. This patch also change task switching code to load segment selectors before segment descriptors, like SDM requires, otherwise permission checking during segment descriptor loading will be incorrect. Cc: stable@kernel.org (2.6.33, 2.6.32) Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/kvm/emulate.c | 30 ++----- arch/x86/kvm/x86.c | 177 ++++++++++++++++++++++++++++++++-------- 3 files changed, 151 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f9a2f66530cf..06d9e79ca37d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -614,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2db760ff887c..a1a7b27adf41 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1309,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, if (rc != 0) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); return rc; } @@ -1491,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); if (rc) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } @@ -2122,12 +2122,11 @@ special_insn: break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_CS) { + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } @@ 
-2135,18 +2134,7 @@ special_insn: if (c->modrm_reg == VCPU_SREG_SS) toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; - } - - if (err < 0) - goto cannot_emulate; + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -2320,11 +2308,9 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 203ee7d0ed58..c3d2acbbb91b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4787,7 +4787,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se .unusable = 0, }; kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; + return X86EMUL_CONTINUE; } static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4797,43 +4797,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } -static void kvm_check_segment_descriptor(struct kvm_vcpu *vcpu, int seg, - u16 selector) -{ - /* NULL selector is not valid for CS and SS */ - if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) - if (!selector) - kvm_queue_exception_e(vcpu, TS_VECTOR, selector >> 3); -} - -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment kvm_seg; struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); - kvm_check_segment_descriptor(vcpu, seg, selector); - kvm_seg.type |= type_bits; + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; - kvm_set_segment(vcpu, &kvm_seg, seg); - if (selector && !kvm_seg.unusable && kvm_seg.s) { + /* can't load system descriptor into segment selector */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ?
SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment DPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { /* mark segment as accessed */ + kvm_seg.type |= 1; seg_desc.type |= 1; save_guest_segment_descriptor(vcpu, selector, &seg_desc); } - return 0; +load: + kvm_set_segment(vcpu, &kvm_seg, seg); + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; } static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4859,6 +4928,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + static int load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { @@ -4876,25 +4953,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors.
If a fault happens at this stage + * it is handled in the context of the new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) return 1; return 0; } @@ -4934,19 +5027,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. If a fault happens at this stage + * it is handled in the context of the new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; return 0; } -- cgit v1.2.2 From e54cfa97a9ca9a544a7257b89b530b505ae1b892 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 18 Feb 2010 12:15:02 +0200 Subject: KVM: Fix emulate_sys[call, enter, exit]()'s fault handling This patch fixes emulate_syscall(), emulate_sysenter() and emulate_sysexit() to handle injected faults properly. Even though the original code injects faults in these functions, we cannot handle them unless we use a return value distinct from the UNHANDLEABLE case. So this patch uses X86EMUL_* codes instead of -1 and 0 and makes x86_emulate_insn() handle these propagated faults. Note that, in x86_emulate_insn(), goto cannot_emulate and goto done with rc equal to X86EMUL_UNHANDLEABLE have the same effect.
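To illustrate the resulting contract, here is a sketch of the three-way dispatch the named codes enable (illustrative only; the actual patch funnels the non-CONTINUE cases through the same 'done' label and lets the saved rc disambiguate them):

rc = emulate_sysenter(ctxt);
if (rc == X86EMUL_CONTINUE)
	goto writeback;		/* instruction fully emulated */
if (rc == X86EMUL_PROPAGATE_FAULT)
	goto done;		/* exception already queued for the guest */
goto cannot_emulate;		/* X86EMUL_UNHANDLEABLE: give up on emulation */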
Signed-off-by: Takuya Yoshikawa Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a1a7b27adf41..4dade6ac0827 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1599,7 +1599,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1636,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } - return 0; + return X86EMUL_CONTINUE; } static int @@ -1649,14 +1649,14 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1665,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; } @@ -1696,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; - return 0; + return X86EMUL_CONTINUE; } static int @@ -1711,7 +1711,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1729,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = (u16)(msr_data + 24); break; @@ -1737,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = cs.selector + 8; cs.db = 0; @@ -1753,7 +1753,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - return 0; + return X86EMUL_CONTINUE; } static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) @@ -2476,8 +2476,9 @@ twobyte_insn: } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; + rc = emulate_syscall(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2548,14 +2549,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto 
writeback; break; -- cgit v1.2.2 From c573cd22939e54fc1b8e672054a505048987a7cb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 23 Feb 2010 17:47:53 +0100 Subject: KVM: VMX: Update instruction length on intercepted BP We intercept #BP while in guest debugging mode. As VM exits due to intercepted exceptions do not necessarily come with valid idt_vectoring, we have to update event_exit_inst_len explicitly in such cases. At least in the absence of migration, this ensures that re-injections of #BP will find and use the correct instruction length. Signed-off-by: Jan Kiszka Cc: stable@kernel.org (2.6.32, 2.6.33) Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f82b0723afa5..14873b9f8430 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2775,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2897,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; -- cgit v1.2.2 From d2be1651b736002e0c76d7095d6c0ba77b4a897c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 23 Feb 2010 17:47:57 +0100 Subject: KVM: x86: Add KVM_CAP_X86_ROBUST_SINGLESTEP This marks the guest single-step API improvement of 94fe45da and 91586a3b with a capability flag to allow reliable detection by user space. Signed-off-by: Jan Kiszka Cc: stable@kernel.org (2.6.33) Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3d2acbbb91b..e46282a56565 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1570,6 +1570,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_X86_ROBUST_SINGLESTEP: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.2.2 From 14be1f7454ea96ee614467a49cf018a1a383b189 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Mon, 1 Mar 2010 11:48:15 -0600 Subject: x86: Fix sched_clock_cpu for systems with unsynchronized TSC On UV systems, the TSC is not synchronized across blades. The sched_clock_cpu() function is returning values that can go backwards (I've seen as much as 8 seconds) when switching between cpus. As each cpu comes up, early_init_intel() will currently set the sched_clock_stable flag true. When mark_tsc_unstable() runs, it clears the flag, but this only occurs once (the first time a cpu comes up whose TSC is not synchronized with cpu 0). After this, early_init_intel() will set the flag again as the next cpu comes up. Only set sched_clock_stable if tsc has not been marked unstable. 
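A hypothetical boot timeline (illustration only, not part of the patch) shows why the guard is needed:

/*
 * cpu0 boots: early_init_intel()  -> sched_clock_stable = 1
 * cpu1 boots: TSC sync test fails -> mark_tsc_unstable() clears the flag
 * cpu2 boots: early_init_intel()  -> sched_clock_stable = 1   (the bug)
 *
 * Guarding the assignment keeps the flag cleared once any CPU has
 * proven the TSC unsynchronized:
 */
if (c->x86_power & (1 << 8)) {
	set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
	set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
	if (!check_tsc_unstable())
		sched_clock_stable = 1;
}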
Signed-off-by: Dimitri Sivanich Acked-by: Venkatesh Pallipadi Acked-by: Peter Zijlstra LKML-Reference: <20100301174815.GC8224@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 879666f4d871..7e1cca13af35 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - sched_clock_stable = 1; + if (!check_tsc_unstable()) + sched_clock_stable = 1; } /* -- cgit v1.2.2 From 320ebf09cbb6d01954c9a060266aa8e0d27f4638 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 12:35:37 +0100 Subject: perf, x86: Restrict the ANY flag The ANY flag can show SMT data of another task (like 'top'), so we want to disable it when system-wide profiling is disabled. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6531b4bdb22d..aab2e1ce9dee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -503,6 +503,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); + if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && + perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; return 0; } -- cgit v1.2.2 From b622d644c7d61a5cb95b74e7b143c263bed21f0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Feb 2010 15:36:30 +0100 Subject: perf_events, x86: Fixup fixed counter constraints Patch 1da53e0230 ("perf_events, x86: Improve x86 event scheduling") lost us one of the fixed purpose counters and then ed8777fc13 ("perf_events, x86: Fix event constraint masks") broke it even further. Widen the fixed event mask to event+umask and specify the full config for each of the 3 fixed purpose counters. Then let the init code fill out the placement for the GP regs based on the cpuid info. 
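As a sketch derived from the hunks below (abbreviated, not additional kernel code): fixed counter n is pinned at bit 32+n of the scheduler's index mask and matched on the full event+umask encoding, and the init code then widens each fixed constraint with the generic counters reported by cpuid:

/*
 * FIXED_EVENT_CONSTRAINT(0x00c0, 0) now expands (roughly) to:
 *   { .idxmsk64 = 1ULL << 32, .code = 0x00c0,
 *     .cmask = INTEL_ARCH_FIXED_MASK, .weight = 1 }
 * init_hw_perf_events() later ORs in the generic-counter bits:
 */
for_each_event_constraint(c, x86_pmu.event_constraints) {
	if (c->cmask != INTEL_ARCH_FIXED_MASK)
		continue;
	c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
	c->weight += x86_pmu.num_events;
}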
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 +- arch/x86/kernel/cpu/perf_event.c | 25 ++++++++++++++++++------- arch/x86/kernel/cpu/perf_event_intel.c | 31 +++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 80e693684f18..db6109a885a7 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -50,7 +50,7 @@ INTEL_ARCH_INV_MASK| \ INTEL_ARCH_EDGE_MASK|\ INTEL_ARCH_UNIT_MASK|\ - INTEL_ARCH_EVTSEL_MASK) + INTEL_ARCH_EVENT_MASK) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index aab2e1ce9dee..bfc43fa208bc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -73,10 +73,10 @@ struct debug_store { struct event_constraint { union { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64[1]; + u64 idxmsk64; }; - int code; - int cmask; + u64 code; + u64 cmask; int weight; }; @@ -103,7 +103,7 @@ struct cpu_hw_events { }; #define __EVENT_CONSTRAINT(c, n, m, w) {\ - { .idxmsk64[0] = (n) }, \ + { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ @@ -116,7 +116,7 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) #define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) + EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) @@ -615,8 +615,8 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) bitmap_zero(used_mask, X86_PMC_IDX_MAX); for (i = 0; i < n; i++) { - constraints[i] = - x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + constraints[i] = c; } /* @@ -1350,6 +1350,7 @@ static void __init pmu_check_apic(void) void __init init_hw_perf_events(void) { + struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1398,6 +1399,16 @@ void __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0, x86_pmu.num_events); + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != INTEL_ARCH_FIXED_MASK) + continue; + + c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; + c->weight += x86_pmu.num_events; + } + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590cf4a5f..4fbdfe5708d9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,7 +1,7 @@ #ifdef CONFIG_CPU_SUP_INTEL /* - * Intel PerfMon v3. Used on Core2 and later. + * Intel PerfMon, used on Core and later. 
*/ static const u64 intel_perfmon_event_map[] = { @@ -27,8 +27,14 @@ static struct event_constraint intel_core_event_constraints[] = static struct event_constraint intel_core2_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* + * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event + * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed + * ratio between these counters. + */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -37,14 +43,16 @@ static struct event_constraint intel_core2_event_constraints[] = INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_nehalem_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -58,8 +66,9 @@ static struct event_constraint intel_nehalem_event_constraints[] = static struct event_constraint intel_westmere_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -68,8 +77,9 @@ static struct event_constraint intel_westmere_event_constraints[] = static struct event_constraint intel_gen_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; @@ -935,7 +945,7 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; - case 28: + case 28: /* Atom */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -951,6 +961,7 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_westmere_event_constraints; pr_cont("Westmere events, "); break; + default: /* * default constraints for v2 and up -- cgit 
v1.2.2 From d8111cd91abee016d62b401e057fee66ba80be67 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 2 Mar 2010 09:19:28 -0800 Subject: x86, mrst: Remove X86_MRST dependency on PCI_IOAPIC PCI_IOAPIC is used for PCI hotplug; Moorestown does not have ACPI PCI hotplug, as it does not have ACPI. This unnecessary dependency causes X86_MRST to fail to be selected if ACPI is not selected. Signed-off-by: Jacob Pan LKML-Reference: <1267550368-7435-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2697fdb25ac2..d22686f69de8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -390,7 +390,6 @@ config X86_MRST bool "Moorestown MID platform" depends on PCI depends on PCI_GOANY - depends on PCI_IOAPIC depends on X86_32 depends on X86_EXTENDED_PLATFORM depends on X86_IO_APIC -- cgit v1.2.2 From 29044ad1509ecc229f1d5a31aeed7a8dc61a71c4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Mar 2010 02:25:22 +0100 Subject: x86/stacktrace: Don't dereference bad frame pointers Callers of a stacktrace might pass bad frame pointers. Those are usually checked for safety in stack walking helpers before any dereferencing, but this is not the case when we need to go through one more frame pointer that backlinks the irq stack to the previous one, as we don't have any reliable address boundaries to compare this frame pointer against. This causes crashes when we record callchains for ftrace events with perf because we don't use the right helpers to capture registers there. We get wrong frame pointers as we call task_pt_regs() even on kernel threads, which is wrong, as it gives us the initial state of freshly created kernel threads. It is not even what we want for user tasks. What we want is a hot snapshot of registers when the ftrace event triggers, not the state before a task entered the kernel. This requires more thought to do correctly, though. So first add a guard to ensure the given frame pointer can be dereferenced to avoid crashes. We'll think about how to fix the callers in a subsequent patch. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Cc: 2.6.33.x Cc: Arnaldo Carvalho de Melo --- arch/x86/kernel/dumpstack_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f5..a6c906c9b193 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -125,9 +125,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, { #ifdef CONFIG_FRAME_POINTER struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long next; - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) - return (unsigned long)frame->next_frame; + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { + if (!probe_kernel_address(&frame->next_frame, next)) + return next; + else + WARN_ONCE(1, "Perf: bad frame pointer = %p in " + "callchain\n", &frame->next_frame); + } #endif return bp; } -- cgit v1.2.2 From 3010673ef5f7bef4b4685566a0713de1b4306c93 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 2 Mar 2010 21:01:34 -0800 Subject: x86, mrst: Fix APB timer per cpu clockevent The current APB timer code incorrectly registers a static copy of the clockevent device for the boot CPU.
The per cpu clockevent should be used instead. This bug was hidden by zero-initialized data; as such it did not get exposed in testing, but was discovered by code review. Signed-off-by: Jacob Pan LKML-Reference: <1267592494-7723-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apb_timer.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 83a345b0256c..6f27f8b75795 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -84,9 +84,10 @@ struct apbt_dev { int disable_apbt_percpu __cpuinitdata; +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); + #ifdef CONFIG_SMP static unsigned int apbt_num_timers_used; -static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); static struct apbt_dev *apbt_devs; #endif @@ -302,6 +303,7 @@ static void apbt_disable_int(int n) static int __init apbt_clockevent_register(void) { struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); if (mtmr == NULL) { @@ -329,22 +331,24 @@ static int __init apbt_clockevent_register(void) * global if not used for per cpu timer. */ apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + adev->num = smp_processor_id(); + memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); if (disable_apbt_percpu) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; - global_clock_event = &apbt_clockevent; + global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", global_clock_event->name); } if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - apbt_clockevent.name, &apbt_clockevent)) { + apbt_clockevent.name, adev)) { printk(KERN_ERR "Failed request IRQ for APBT%d\n", apbt_clockevent.irq); } - clockevents_register_device(&apbt_clockevent); + clockevents_register_device(&adev->evt); /* Start APBT 0 interrupts */ apbt_enable_int(APBT_CLOCKEVENT0_NUM); -- cgit v1.2.2 From c7bbf52aa4fa332b84c4f2bb33e69561ee6870b4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 3 Mar 2010 13:38:48 -0800 Subject: x86, mrst: Fix whitespace breakage in apb_timer.c Checkin bb24c4716185f6e116c440462c65c1f56649183b: "Moorestown APB system timer driver" suffered from severe whitespace damage in arch/x86/kernel/apb_timer.c due to using Microsoft Lookout to send a patch. Fix the whitespace breakage. Reported-by: Jacob Pan Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apb_timer.c | 1068 +++++++++++++++++++++---------------------- 1 file changed, 534 insertions(+), 534 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 6f27f8b75795..2afa27d01297 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,11 +43,11 @@ #include #include -#define APBT_MASK CLOCKSOURCE_MASK(32) -#define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 -#define APBT_CLOCKSOURCE_RATING 250 -#define APBT_MIN_DELTA_USEC 200 +#define APBT_MASK CLOCKSOURCE_MASK(32) +#define APBT_SHIFT 22 +#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKSOURCE_RATING 250 +#define APBT_MIN_DELTA_USEC 200 #define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) #define APBT_CLOCKEVENT0_NUM (0) @@ -65,21 +65,21 @@ static int phy_cs_timer_id; static uint64_t apbt_freq; static void apbt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt); + struct clock_event_device *evt); static int apbt_next_event(unsigned long delta, - struct clock_event_device *evt); + struct clock_event_device *evt); static cycle_t apbt_read_clocksource(struct clocksource *cs); static void apbt_restart_clocksource(void); struct apbt_dev { - struct clock_event_device evt; - unsigned int num; - int cpu; - unsigned int irq; - unsigned int tick; - unsigned int count; - unsigned int flags; - char name[10]; + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int tick; + unsigned int count; + unsigned int flags; + char name[10]; }; int disable_apbt_percpu __cpuinitdata; @@ -91,77 +91,77 @@ static unsigned int apbt_num_timers_used; static struct apbt_dev *apbt_devs; #endif -static inline unsigned long apbt_readl_reg(unsigned long a) +static inline unsigned long apbt_readl_reg(unsigned long a) { - return readl(apbt_virt_address + a); + return readl(apbt_virt_address + a); } static inline void apbt_writel_reg(unsigned long d, unsigned long a) { - writel(d, apbt_virt_address + a); + writel(d, apbt_virt_address + a); } static inline unsigned long apbt_readl(int n, unsigned long a) { - return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); + return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); } static inline void apbt_writel(int n, unsigned long d, unsigned long a) { - writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); + writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); } static inline void apbt_set_mapping(void) { - struct sfi_timer_table_entry *mtmr; - - if (apbt_virt_address) { - pr_debug("APBT base already mapped\n"); - return; - } - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return; - } - apbt_address = (unsigned long)mtmr->phys_addr; - if (!apbt_address) { - printk(KERN_WARNING "No timer base from SFI, use default\n"); - apbt_address = APBT_DEFAULT_BASE; - } - apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); - if (apbt_virt_address) { - pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ - (void *)apbt_address, (void *)apbt_virt_address); - } else { - pr_debug("Failed mapping APBT phy address at %p\n",\ - (void *)apbt_address); - goto panic_noapbt; - } - apbt_freq = mtmr->freq_hz / USEC_PER_SEC; - sfi_free_mtmr(mtmr); - - /* Now figure out the physical timer id for clocksource device */ - mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); - if (mtmr == NULL) - goto panic_noapbt; - - /* Now figure 
out the physical timer id */ - phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) - / APBTMRS_REG_SIZE; - pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); - return; + struct sfi_timer_table_entry *mtmr; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (unsigned long)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (apbt_virt_address) { + pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ + (void *)apbt_address, (void *)apbt_virt_address); + } else { + pr_debug("Failed mapping APBT phy address at %p\n",\ + (void *)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) + / APBTMRS_REG_SIZE; + pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + return; panic_noapbt: - panic("Failed to setup APB system timer\n"); + panic("Failed to setup APB system timer\n"); } static inline void apbt_clear_mapping(void) { - iounmap(apbt_virt_address); - apbt_virt_address = NULL; + iounmap(apbt_virt_address); + apbt_virt_address = NULL; } /* @@ -169,28 +169,28 @@ static inline void apbt_clear_mapping(void) */ static inline int is_apbt_capable(void) { - return apbt_virt_address ? 1 : 0; + return apbt_virt_address ? 
1 : 0; } static struct clocksource clocksource_apbt = { - .name = "apbt", - .rating = APBT_CLOCKSOURCE_RATING, - .read = apbt_read_clocksource, - .mask = APBT_MASK, - .shift = APBT_SHIFT, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .resume = apbt_restart_clocksource, + .name = "apbt", + .rating = APBT_CLOCKSOURCE_RATING, + .read = apbt_read_clocksource, + .mask = APBT_MASK, + .shift = APBT_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = apbt_restart_clocksource, }; /* boot APB clock event device */ static struct clock_event_device apbt_clockevent = { - .name = "apbt0", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = apbt_set_mode, - .set_next_event = apbt_next_event, - .shift = APBT_SHIFT, - .irq = 0, - .rating = APBT_CLOCKEVENT_RATING, + .name = "apbt0", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = apbt_set_mode, + .set_next_event = apbt_next_event, + .shift = APBT_SHIFT, + .irq = 0, + .rating = APBT_CLOCKEVENT_RATING, }; /* @@ -199,20 +199,20 @@ static struct clock_event_device apbt_clockevent = { */ static inline int __init setup_x86_mrst_timer(char *arg) { - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + disable_apbt_percpu = 0; + else if (strcmp("lapic_and_apbt", arg) == 0) + disable_apbt_percpu = 1; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; } __setup("x86_mrst_timer=", setup_x86_mrst_timer); @@ -222,176 +222,176 @@ __setup("x86_mrst_timer=", setup_x86_mrst_timer); */ static void apbt_start_counter(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); - /* enable, mask interrupt */ - ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; - ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - /* read it once to get cached counter value initialized */ - apbt_read_clocksource(&clocksource_apbt); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); + /* enable, mask interrupt */ + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + /* read it once to get cached counter value initialized */ + apbt_read_clocksource(&clocksource_apbt); } static irqreturn_t apbt_interrupt_handler(int irq, void *data) { - struct apbt_dev *dev = (struct apbt_dev *)data; - struct clock_event_device *aevt = &dev->evt; - - if (!aevt->event_handler) { - printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", - dev->num); - return IRQ_NONE; - } - aevt->event_handler(aevt); - return IRQ_HANDLED; + struct apbt_dev *dev = (struct apbt_dev *)data; + struct clock_event_device *aevt = &dev->evt; + + if (!aevt->event_handler) { + printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", + dev->num); + return IRQ_NONE; + } + aevt->event_handler(aevt); + return IRQ_HANDLED; } static void apbt_restart_clocksource(void) { - 
apbt_start_counter(phy_cs_timer_id); + apbt_start_counter(phy_cs_timer_id); } /* Setup IRQ routing via IOAPIC */ #ifdef CONFIG_SMP static void apbt_setup_irq(struct apbt_dev *adev) { - struct irq_chip *chip; - struct irq_desc *desc; - - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - desc = irq_to_desc(adev->irq); - chip = get_irq_chip(adev->irq); - disable_irq(adev->irq); - desc->status |= IRQ_MOVE_PCNTXT; - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); - /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ - set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); - enable_irq(adev->irq); - if (system_state == SYSTEM_BOOTING) - if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - adev->name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - adev->num); - } + struct irq_chip *chip; + struct irq_desc *desc; + + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + desc = irq_to_desc(adev->irq); + chip = get_irq_chip(adev->irq); + disable_irq(adev->irq); + desc->status |= IRQ_MOVE_PCNTXT; + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ + set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); + enable_irq(adev->irq); + if (system_state == SYSTEM_BOOTING) + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } } #endif static void apbt_enable_int(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - /* clear pending intr */ - apbt_readl(n, APBTMR_N_EOI); - ctrl &= ~APBTMR_CONTROL_INT; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + /* clear pending intr */ + apbt_readl(n, APBTMR_N_EOI); + ctrl &= ~APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); } static void apbt_disable_int(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - ctrl |= APBTMR_CONTROL_INT; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); } static int __init apbt_clockevent_register(void) { - struct sfi_timer_table_entry *mtmr; - struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); - - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return -ENODEV; - } - - /* - * We need to calculate the scaled math multiplication factor for - * nanosecond to apbt tick conversion. - * mult = (nsec/cycle)*2^APBT_SHIFT - */ - apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz - , NSEC_PER_SEC, APBT_SHIFT); - - /* Calculate the min / max delta */ - apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &apbt_clockevent); - apbt_clockevent.min_delta_ns = clockevent_delta2ns( - APBT_MIN_DELTA_USEC*apbt_freq, - &apbt_clockevent); - /* - * Start apbt with the boot cpu mask and make it - * global if not used for per cpu timer. 
- */ - apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); - adev->num = smp_processor_id(); - memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - - if (disable_apbt_percpu) { - apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; + struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz + , NSEC_PER_SEC, APBT_SHIFT); + + /* Calculate the min / max delta */ + apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &apbt_clockevent); + apbt_clockevent.min_delta_ns = clockevent_delta2ns( + APBT_MIN_DELTA_USEC*apbt_freq, + &apbt_clockevent); + /* + * Start apbt with the boot cpu mask and make it + * global if not used for per cpu timer. + */ + apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + adev->num = smp_processor_id(); + memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); + + if (disable_apbt_percpu) { + apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; - printk(KERN_DEBUG "%s clockevent registered as global\n", - global_clock_event->name); - } - - if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - apbt_clockevent.name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - apbt_clockevent.irq); - } - - clockevents_register_device(&adev->evt); - /* Start APBT 0 interrupts */ - apbt_enable_int(APBT_CLOCKEVENT0_NUM); - - sfi_free_mtmr(mtmr); - return 0; + printk(KERN_DEBUG "%s clockevent registered as global\n", + global_clock_event->name); + } + + if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + apbt_clockevent.name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + apbt_clockevent.irq); + } + + clockevents_register_device(&adev->evt); + /* Start APBT 0 interrupts */ + apbt_enable_int(APBT_CLOCKEVENT0_NUM); + + sfi_free_mtmr(mtmr); + return 0; } #ifdef CONFIG_SMP /* Should be called with per cpu */ void apbt_setup_secondary_clock(void) { - struct apbt_dev *adev; - struct clock_event_device *aevt; - int cpu; - - /* Don't register boot CPU clockevent */ - cpu = smp_processor_id(); - if (cpu == boot_cpu_id) - return; - /* - * We need to calculate the scaled math multiplication factor for - * nanosecond to apbt tick conversion. 
- * mult = (nsec/cycle)*2^APBT_SHIFT - */ - printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); - adev = &per_cpu(cpu_apbt_dev, cpu); - aevt = &adev->evt; - - memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); - aevt->cpumask = cpumask_of(cpu); - aevt->name = adev->name; - aevt->mode = CLOCK_EVT_MODE_UNUSED; - - printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", - cpu, aevt->name, *(u32 *)aevt->cpumask); - - apbt_setup_irq(adev); - - clockevents_register_device(aevt); - - apbt_enable_int(cpu); - - return; + struct apbt_dev *adev; + struct clock_event_device *aevt; + int cpu; + + /* Don't register boot CPU clockevent */ + cpu = smp_processor_id(); + if (cpu == boot_cpu_id) + return; + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); + adev = &per_cpu(cpu_apbt_dev, cpu); + aevt = &adev->evt; + + memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); + aevt->cpumask = cpumask_of(cpu); + aevt->name = adev->name; + aevt->mode = CLOCK_EVT_MODE_UNUSED; + + printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", + cpu, aevt->name, *(u32 *)aevt->cpumask); + + apbt_setup_irq(adev); + + clockevents_register_device(aevt); + + apbt_enable_int(cpu); + + return; } /* @@ -405,34 +405,34 @@ void apbt_setup_secondary_clock(void) * the extra interrupt is harmless. */ static int apbt_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { - unsigned long cpu = (unsigned long)hcpu; - struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - - switch (action & 0xf) { - case CPU_DEAD: - apbt_disable_int(cpu); - if (system_state == SYSTEM_RUNNING) - pr_debug("skipping APBT CPU %lu offline\n", cpu); - else if (adev) { - pr_debug("APBT clockevent for cpu %lu offline\n", cpu); - free_irq(adev->irq, adev); - } - break; - default: - pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); - } - return NOTIFY_OK; + unsigned long cpu = (unsigned long)hcpu; + struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + + switch (action & 0xf) { + case CPU_DEAD: + apbt_disable_int(cpu); + if (system_state == SYSTEM_RUNNING) + pr_debug("skipping APBT CPU %lu offline\n", cpu); + else if (adev) { + pr_debug("APBT clockevent for cpu %lu offline\n", cpu); + free_irq(adev->irq, adev); + } + break; + default: + pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + } + return NOTIFY_OK; } static __init int apbt_late_init(void) { - if (disable_apbt_percpu) - return 0; - /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(apbt_cpuhp_notify, -20); - return 0; + if (disable_apbt_percpu) + return 0; + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(apbt_cpuhp_notify, -20); + return 0; } fs_initcall(apbt_late_init); #else @@ -442,93 +442,93 @@ void apbt_setup_secondary_clock(void) {} #endif /* CONFIG_SMP */ static void apbt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long ctrl; - uint64_t delta; - int timer_num; - struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); - - timer_num = adev->num; - pr_debug("%s CPU %d timer %d mode=%d\n", - __func__, first_cpu(*evt->cpumask), timer_num, mode); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; - delta >>= apbt_clockevent.shift; - ctrl = 
apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl |= APBTMR_CONTROL_MODE_PERIODIC; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* - * DW APB p. 46, have to disable timer before load counter, - * may cause sync problem. - */ - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - udelay(1); - pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); - apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - /* APB timer does not have one-shot mode, use free running mode */ - case CLOCK_EVT_MODE_ONESHOT: - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - /* - * set free running mode, this mode will let timer reload max - * timeout which will give time (3min on 25MHz clock) to rearm - * the next event, therefore emulate the one-shot mode. - */ - ctrl &= ~APBTMR_CONTROL_ENABLE; - ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; - - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* write again to set free running mode */ - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - - /* - * DW APB p. 46, load counter with all 1s before starting free - * running mode. - */ - apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); - ctrl &= ~APBTMR_CONTROL_INT; - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - apbt_disable_int(timer_num); - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - - case CLOCK_EVT_MODE_RESUME: - apbt_enable_int(timer_num); - break; - } + unsigned long ctrl; + uint64_t delta; + int timer_num; + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + pr_debug("%s CPU %d timer %d mode=%d\n", + __func__, first_cpu(*evt->cpumask), timer_num, mode); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; + delta >>= apbt_clockevent.shift; + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_MODE_PERIODIC; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* + * DW APB p. 46, have to disable timer before load counter, + * may cause sync problem. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + udelay(1); + pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + /* APB timer does not have one-shot mode, use free running mode */ + case CLOCK_EVT_MODE_ONESHOT: + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + /* + * set free running mode, this mode will let timer reload max + * timeout which will give time (3min on 25MHz clock) to rearm + * the next event, therefore emulate the one-shot mode. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write again to set free running mode */ + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + + /* + * DW APB p. 46, load counter with all 1s before starting free + * running mode. 
+ */ + apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); + ctrl &= ~APBTMR_CONTROL_INT; + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + apbt_disable_int(timer_num); + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_RESUME: + apbt_enable_int(timer_num); + break; + } } static int apbt_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long ctrl; - int timer_num; - - struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); - - timer_num = adev->num; - /* Disable timer */ - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* write new count */ - apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - return 0; + unsigned long ctrl; + int timer_num; + + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + /* Disable timer */ + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write new count */ + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + return 0; } /* @@ -540,94 +540,94 @@ static int apbt_next_event(unsigned long delta, */ static cycle_t apbt_read_clocksource(struct clocksource *cs) { - unsigned long t0, t1, t2; - static unsigned long last_read; + unsigned long t0, t1, t2; + static unsigned long last_read; bad_count: - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if (unlikely(t1 < t2)) { - pr_debug("APBT: read current count error %lx:%lx:%lx\n", - t1, t2, t2 - t1); - goto bad_count; - } - /* - * check against cached last read, makes sure time does not go back. - * it could be a normal rollover but we will do tripple check anyway - */ - if (unlikely(t2 > last_read)) { - /* check if we have a normal rollover */ - unsigned long raw_intr_status = - apbt_readl_reg(APBTMRS_RAW_INT_STATUS); - /* - * cs timer interrupt is masked but raw intr bit is set if - * rollover occurs. then we read EOI reg to clear it. - */ - if (raw_intr_status & (1 << phy_cs_timer_id)) { - apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); - goto out; - } - pr_debug("APB CS going back %lx:%lx:%lx ", - t2, last_read, t2 - last_read); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if (unlikely(t1 < t2)) { + pr_debug("APBT: read current count error %lx:%lx:%lx\n", + t1, t2, t2 - t1); + goto bad_count; + } + /* + * check against cached last read, makes sure time does not go back. + * it could be a normal rollover but we will do tripple check anyway + */ + if (unlikely(t2 > last_read)) { + /* check if we have a normal rollover */ + unsigned long raw_intr_status = + apbt_readl_reg(APBTMRS_RAW_INT_STATUS); + /* + * cs timer interrupt is masked but raw intr bit is set if + * rollover occurs. then we read EOI reg to clear it. 
+ */ + if (raw_intr_status & (1 << phy_cs_timer_id)) { + apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); + goto out; + } + pr_debug("APB CS going back %lx:%lx:%lx ", + t2, last_read, t2 - last_read); bad_count_x3: - pr_debug(KERN_INFO "tripple check enforced\n"); - t0 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if ((t2 > t1) || (t1 > t0)) { - printk(KERN_ERR "Error: APB CS tripple check failed\n"); - goto bad_count_x3; - } - } + pr_debug(KERN_INFO "tripple check enforced\n"); + t0 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if ((t2 > t1) || (t1 > t0)) { + printk(KERN_ERR "Error: APB CS tripple check failed\n"); + goto bad_count_x3; + } + } out: - last_read = t2; - return (cycle_t)~t2; + last_read = t2; + return (cycle_t)~t2; } static int apbt_clocksource_register(void) { - u64 start, now; - cycle_t t1; - - /* Start the counter, use timer 2 as source, timer 0/1 for event */ - apbt_start_counter(phy_cs_timer_id); - - /* Verify whether apbt counter works */ - t1 = apbt_read_clocksource(&clocksource_apbt); - rdtscll(start); - - /* - * We don't know the TSC frequency yet, but waiting for - * 200000 TSC cycles is safe: - * 4 GHz == 50us - * 1 GHz == 200us - */ - do { - rep_nop(); - rdtscll(now); - } while ((now - start) < 200000UL); - - /* APBT is the only always on clocksource, it has to work! */ - if (t1 == apbt_read_clocksource(&clocksource_apbt)) - panic("APBT counter not counting. APBT disabled\n"); - - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); - - return 0; + u64 start, now; + cycle_t t1; + + /* Start the counter, use timer 2 as source, timer 0/1 for event */ + apbt_start_counter(phy_cs_timer_id); + + /* Verify whether apbt counter works */ + t1 = apbt_read_clocksource(&clocksource_apbt); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + /* APBT is the only always on clocksource, it has to work! */ + if (t1 == apbt_read_clocksource(&clocksource_apbt)) + panic("APBT counter not counting. 
APBT disabled\n"); + + /* + * initialize and register APBT clocksource + * convert that to ns/clock cycle + * mult = (ns/c) * 2^APBT_SHIFT + */ + clocksource_apbt.mult = div_sc(MSEC_PER_SEC, + (unsigned long) apbt_freq, APBT_SHIFT); + clocksource_register(&clocksource_apbt); + + return 0; } /* @@ -640,145 +640,145 @@ static int apbt_clocksource_register(void) void __init apbt_time_init(void) { #ifdef CONFIG_SMP - int i; - struct sfi_timer_table_entry *p_mtmr; - unsigned int percpu_timer; - struct apbt_dev *adev; + int i; + struct sfi_timer_table_entry *p_mtmr; + unsigned int percpu_timer; + struct apbt_dev *adev; #endif - if (apb_timer_block_enabled) - return; - apbt_set_mapping(); - if (apbt_virt_address) { - pr_debug("Found APBT version 0x%lx\n",\ - apbt_readl_reg(APBTMRS_COMP_VERSION)); - } else - goto out_noapbt; - /* - * Read the frequency and check for a sane value, for ESL model - * we extend the possible clock range to allow time scaling. - */ - - if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { - pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); - goto out_noapbt; - } - if (apbt_clocksource_register()) { - pr_debug("APBT has failed to register clocksource\n"); - goto out_noapbt; - } - if (!apbt_clockevent_register()) - apb_timer_block_enabled = 1; - else { - pr_debug("APBT has failed to register clockevent\n"); - goto out_noapbt; - } + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (apbt_virt_address) { + pr_debug("Found APBT version 0x%lx\n",\ + apbt_readl_reg(APBTMRS_COMP_VERSION)); + } else + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } #ifdef CONFIG_SMP - /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { - printk(KERN_INFO "apbt: disabled per cpu timer\n"); - return; - } - pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); - if (num_possible_cpus() <= sfi_mtimer_num) { - percpu_timer = 1; - apbt_num_timers_used = num_possible_cpus(); - } else { - percpu_timer = 0; - apbt_num_timers_used = 1; - adev = &per_cpu(cpu_apbt_dev, 0); - adev->flags &= ~APBT_DEV_USED; - } - pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); - - /* here we set up per CPU timer data structure */ - apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, - GFP_KERNEL); - if (!apbt_devs) { - printk(KERN_ERR "Failed to allocate APB timer devices\n"); - return; - } - for (i = 0; i < apbt_num_timers_used; i++) { - adev = &per_cpu(cpu_apbt_dev, i); - adev->num = i; - adev->cpu = i; - p_mtmr = sfi_get_mtmr(i); - if (p_mtmr) { - adev->tick = p_mtmr->freq_hz; - adev->irq = p_mtmr->irq; - } else - printk(KERN_ERR "Failed to get timer for cpu %d\n", i); - adev->count = 0; - sprintf(adev->name, "apbt%d", i); - } + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (disable_apbt_percpu) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= 
sfi_mtimer_num) { + percpu_timer = 1; + apbt_num_timers_used = num_possible_cpus(); + } else { + percpu_timer = 0; + apbt_num_timers_used = 1; + adev = &per_cpu(cpu_apbt_dev, 0); + adev->flags &= ~APBT_DEV_USED; + } + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, + GFP_KERNEL); + if (!apbt_devs) { + printk(KERN_ERR "Failed to allocate APB timer devices\n"); + return; + } + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) { + adev->tick = p_mtmr->freq_hz; + adev->irq = p_mtmr->irq; + } else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + adev->count = 0; + sprintf(adev->name, "apbt%d", i); + } #endif - return; + return; out_noapbt: - apbt_clear_mapping(); - apb_timer_block_enabled = 0; - panic("failed to enable APB timer\n"); + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); } static inline void apbt_disable(int n) { - if (is_apbt_capable()) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - } + if (is_apbt_capable()) { + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + } } /* called before apb_timer_enable, use early map */ unsigned long apbt_quick_calibrate() { - int i, scale; - u64 old, new; - cycle_t t1, t2; - unsigned long khz = 0; - u32 loop, shift; - - apbt_set_mapping(); - apbt_start_counter(phy_cs_timer_id); - - /* check if the timer can count down, otherwise return */ - old = apbt_read_clocksource(&clocksource_apbt); - i = 10000; - while (--i) { - if (old != apbt_read_clocksource(&clocksource_apbt)) - break; - } - if (!i) - goto failed; - - /* count 16 ms */ - loop = (apbt_freq * 1000) << 4; - - /* restart the timer to ensure it won't get to 0 in the calibration */ - apbt_start_counter(phy_cs_timer_id); - - old = apbt_read_clocksource(&clocksource_apbt); - old += loop; - - t1 = __native_read_tsc(); - - do { - new = apbt_read_clocksource(&clocksource_apbt); - } while (new < old); - - t2 = __native_read_tsc(); - - shift = 5; - if (unlikely(loop >> shift == 0)) { - printk(KERN_INFO - "APBT TSC calibration failed, not enough resolution\n"); - return 0; - } - scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * apbt_freq * 1000) >> shift; - printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); - return khz; + int i, scale; + u64 old, new; + cycle_t t1, t2; + unsigned long khz = 0; + u32 loop, shift; + + apbt_set_mapping(); + apbt_start_counter(phy_cs_timer_id); + + /* check if the timer can count down, otherwise return */ + old = apbt_read_clocksource(&clocksource_apbt); + i = 10000; + while (--i) { + if (old != apbt_read_clocksource(&clocksource_apbt)) + break; + } + if (!i) + goto failed; + + /* count 16 ms */ + loop = (apbt_freq * 1000) << 4; + + /* restart the timer to ensure it won't get to 0 in the calibration */ + apbt_start_counter(phy_cs_timer_id); + + old = apbt_read_clocksource(&clocksource_apbt); + old += loop; + + t1 = __native_read_tsc(); + + do { + new = apbt_read_clocksource(&clocksource_apbt); + } while (new < old); + + t2 = __native_read_tsc(); + + shift = 5; + if (unlikely(loop >> shift == 0)) { + printk(KERN_INFO + "APBT TSC calibration failed, not enough 
resolution\n"); + return 0; + } + scale = (int)div_u64((t2 - t1), loop >> shift); + khz = (scale * apbt_freq * 1000) >> shift; + printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); + return khz; failed: - return 0; + return 0; } -- cgit v1.2.2 From e5a11016643d1ab7172193591506d33a844734cc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 3 Mar 2010 22:38:50 -0500 Subject: x86: Issue at least one memory barrier in stop_machine_text_poke() Fix stop_machine_text_poke() to issue smp_mb() before exiting waiting loop, and use cpu_relax() for waiting. Changes in v2: - Don't use ACCESS_ONCE(). Signed-off-by: Masami Hiramatsu Acked-by: Mathieu Desnoyers Cc: systemtap Cc: DLE Cc: Jason Baron LKML-Reference: <20100304033850.3819.74590.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c41f13c15e8f..e0b877099470 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -595,8 +595,8 @@ static int __kprobes stop_machine_text_poke(void *data) wrote_text = 1; } else { while (!wrote_text) - smp_rmb(); - sync_core(); + cpu_relax(); + smp_mb(); /* Load wrote_text before following execution */ } flush_icache_range((unsigned long)tpp->addr, -- cgit v1.2.2 From 6c550ee41596798cbd873d3df9f8ea0a4ce7ad2f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 5 Mar 2010 09:52:52 -0800 Subject: x86: fix mtrr missing kernel-doc Fix missing kernel-doc notation in mtrr/main.c: Warning(arch/x86/kernel/cpu/mtrr/main.c:152): No description found for parameter 'info' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index fe4622e8c837..79556bd9b602 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -145,6 +145,7 @@ struct set_mtrr_data { /** * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * @info: pointer to mtrr configuration data * * Returns nothing. */ -- cgit v1.2.2 From 984b3f5746ed2cde3d184651dabf26980f2b66e5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 5 Mar 2010 13:41:37 -0800 Subject: bitops: rename for_each_bit() to for_each_set_bit() Rename for_each_bit to for_each_set_bit in the kernel source tree. To permit for_each_clear_bit(), should that ever be added. The patch includes a macro to map the old for_each_bit() onto the new for_each_set_bit(). This is a (very) temporary thing to ease the migration. [akpm@linux-foundation.org: add temporary for_each_bit()] Suggested-by: Alexey Dobriyan Suggested-by: Andrew Morton Signed-off-by: Akinobu Mita Cc: "David S. 
Miller" Cc: Russell King Cc: David Woodhouse Cc: Artem Bityutskiy Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9dddbc..b1fbdeecf6c9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -676,7 +676,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (c->weight != w) continue; - for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590cf4a5f..977e7544738c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -757,7 +757,7 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); -- cgit v1.2.2 From 52cf25d0ab7f78eeecc59ac652ed5090f69b619e Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Tue, 19 Jan 2010 02:58:23 +0100 Subject: Driver core: Constify struct sysfs_ops in struct kobj_type Constify struct sysfs_ops. This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy Acked-by: David Teigland Acked-by: Matt Domsch Acked-by: Maciej Sosnowski Acked-by: Hans J. Koch Acked-by: Pekka Enberg Acked-by: Jens Axboe Acked-by: Stephen Hemminger Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index eddb1bdd1b8f..b3eeb66c0a51 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -903,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = { .show = show, .store = store, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 83a3d1f4efca..cda932ca3ade 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -388,7 +388,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops threshold_ops = { +static const struct sysfs_ops threshold_ops = { .show = show, .store = store, }; -- cgit v1.2.2 From a07e4156a2ee6359d31a44946d7ee7f85dbf6bca Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 11 Feb 2010 15:23:05 -0800 Subject: sysfs: Use sysfs_attr_init and sysfs_bin_attr_init on dynamic attributes These are the non-static sysfs attributes that exist on my test machine. Fix them to use sysfs_attr_init or sysfs_bin_attr_init as appropriate. It simply requires making a sysfs attribute present to see this. So this is a little bit tedious but otherwise not too bad. Signed-off-by: Eric W. Biederman Acked-by: WANG Cong Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a8aacd4b513c..28cba46bf32c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2044,6 +2044,7 @@ static __init void mce_init_banks(void) struct mce_bank *b = &mce_banks[i]; struct sysdev_attribute *a = &b->attr; + sysfs_attr_init(&a->attr); a->attr.name = b->attrname; snprintf(b->attrname, ATTR_LEN, "bank%d", i); -- cgit v1.2.2 From 8b408fe4f853dcfa18d133aa4cf1d7546b4c3870 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Mar 2010 14:20:07 +0100 Subject: x86/amd-iommu: Use helper function to destroy domain In the amd_iommu_domain_destroy the protection_domain_free function is partly reimplemented. The 'partly' is the bug here because the domain is not deleted from the domain list. This results in use-after-free errors and data-corruption. Fix it by just using protection_domain_free instead. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0c0425436a73..b06f29e275e9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2380,9 +2380,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) free_pagetable(domain); - domain_id_free(domain->id); - - kfree(domain); + protection_domain_free(domain); dom->priv = NULL; } -- cgit v1.2.2 From dc1d628a67a8f042e711ea5accc0beedc3ef0092 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Mar 2010 15:55:04 +0100 Subject: perf: Provide generic perf_sample_data initialization This makes it easier to extend perf_sample_data and fixes a bug on arm and sparc, which failed to set ->raw to NULL, which can cause crashes when combined with PERF_SAMPLE_RAW. It also optimizes PowerPC and tracepoint, because the struct initialization is forced to zero out the whole structure. Signed-off-by: Peter Zijlstra Acked-by: Jean Pihet Reviewed-by: Frederic Weisbecker Acked-by: David S. 
Miller Cc: Jamie Iles Cc: Paul Mackerras Cc: Stephane Eranian Cc: stable@kernel.org LKML-Reference: <20100304140100.315416040@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +-- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 97cddbf32936..42aafd11e170 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1097,8 +1097,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 73102df8bfc1..44b60c852107 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -590,10 +590,9 @@ static void intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; regs.ip = 0; /* @@ -742,8 +741,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 ack, status; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); -- cgit v1.2.2 From 3f6da3905398826d85731247e7fbcf53400c18bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Mar 2010 13:01:18 +0100 Subject: perf: Rework and fix the arch CPU-hotplug hooks Remove the hw_perf_event_*() hotplug hooks in favour of per PMU hotplug notifiers. This has the advantage of reducing the static weak interface as well as exposing all hotplug actions to the PMU. Use this to fix x86 hotplug usage where we did things in ONLINE which should have been done in UP_PREPARE or STARTING. 
Signed-off-by: Peter Zijlstra Cc: Paul Mundt Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo LKML-Reference: <20100305154128.736225361@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 70 ++++++++++++++++++++-------------- arch/x86/kernel/cpu/perf_event_amd.c | 60 ++++++++++++----------------- arch/x86/kernel/cpu/perf_event_intel.c | 5 ++- 3 files changed, 71 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 42aafd11e170..585d5608ae6b 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -157,6 +157,11 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; + + void (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); }; static struct x86_pmu x86_pmu __read_mostly; @@ -293,7 +298,7 @@ static inline bool bts_available(void) return x86_pmu.enable_bts != NULL; } -static inline void init_debug_store_on_cpu(int cpu) +static void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -305,7 +310,7 @@ static inline void init_debug_store_on_cpu(int cpu) (u32)((u64)(unsigned long)ds >> 32)); } -static inline void fini_debug_store_on_cpu(int cpu) +static void fini_debug_store_on_cpu(int cpu) { if (!per_cpu(cpu_hw_events, cpu).ds) return; @@ -1337,6 +1342,39 @@ undo: #include "perf_event_p6.c" #include "perf_event_intel.c" +static int __cpuinit +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + if (x86_pmu.cpu_prepare) + x86_pmu.cpu_prepare(cpu); + break; + + case CPU_STARTING: + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); + break; + + case CPU_DYING: + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); + break; + + case CPU_DEAD: + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + static void __init pmu_check_apic(void) { if (cpu_has_apic) @@ -1415,6 +1453,8 @@ void __init init_hw_perf_events(void) pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); pr_info("... 
event mask: %016Lx\n", perf_event_mask); + + perf_cpu_notifier(x86_pmu_notifier); } static inline void x86_pmu_read(struct perf_event *event) @@ -1674,29 +1714,3 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } - -void hw_perf_event_setup_online(int cpu) -{ - init_debug_store_on_cpu(cpu); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - amd_pmu_cpu_online(cpu); - break; - default: - return; - } -} - -void hw_perf_event_setup_offline(int cpu) -{ - init_debug_store_on_cpu(cpu); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - amd_pmu_cpu_offline(cpu); - break; - default: - return; - } -} diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 8f3dbfda3c4f..014528ba7d57 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -271,28 +271,6 @@ done: return &emptyconstraint; } -static __initconst struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints -}; - static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) { struct amd_nb *nb; @@ -378,6 +356,31 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_unlock(&amd_nb_lock); } +static __initconst struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .cpu_prepare = amd_pmu_cpu_online, + .cpu_dead = amd_pmu_cpu_offline, +}; + static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ @@ -390,11 +393,6 @@ static __init int amd_pmu_init(void) memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - /* - * explicitly initialize the boot cpu, other cpus will get - * the cpu hotplug callbacks from smp_init() - */ - amd_pmu_cpu_online(smp_processor_id()); return 0; } @@ -405,12 +403,4 @@ static int amd_pmu_init(void) return 0; } -static void amd_pmu_cpu_online(int cpu) -{ -} - -static void amd_pmu_cpu_offline(int cpu) -{ -} - #endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 44b60c852107..12e811a7d747 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -870,7 +870,10 @@ static __initconst struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, - 
.get_event_constraints = intel_get_event_constraints + .get_event_constraints = intel_get_event_constraints, + + .cpu_starting = init_debug_store_on_cpu, + .cpu_dying = fini_debug_store_on_cpu, }; static __init int intel_pmu_init(void) -- cgit v1.2.2 From 3fb2b8ddcc6a7aa62af6bd2cb939edfd4c460506 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Mar 2010 13:51:01 +0100 Subject: perf, x86: Do not use perf_disable from NMI context Explicitly use intel_pmu_{disable,enable}_all() in intel_pmu_handle_irq() to avoid the NMI race conditions in perf_{disable,enable}. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 12e811a7d747..c582449163fa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -745,11 +745,11 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); - perf_disable(); + intel_pmu_disable_all(); intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); if (!status) { - perf_enable(); + intel_pmu_enable_all(); return 0; } @@ -759,8 +759,7 @@ again: WARN_ONCE(1, "perfevents: irq loop stuck!\n"); perf_event_print_debug(); intel_pmu_reset(); - perf_enable(); - return 1; + goto done; } inc_irq_stat(apic_perf_irqs); @@ -790,8 +789,8 @@ again: if (status) goto again; - perf_enable(); - +done: + intel_pmu_enable_all(); return 1; } -- cgit v1.2.2 From 07088edb88164c2a2406cd2d9a7be19d8515214b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 20:16:01 +0100 Subject: perf, x86: Remove superfluous arguments to x86_perf_event_set_period() The second and third argument to x86_perf_event_set_period() are superfluous since they are simple expressions of the first argument. Hence remove them.
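In miniature, the refactor looks like this (a sketch using hypothetical, minimal struct layouts; the real types live in the perf headers):

/* Both removed arguments are recomputed from the first one. */
struct hw_perf_event { int idx; };
struct perf_event { struct hw_perf_event hw; };

static int set_period_sketch(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;	/* was the 2nd argument */
	int idx = hwc->idx;			/* was the 3rd argument */

	return idx < 0 ? -1 : 0;		/* placeholder for the real body */
}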
Signed-off-by: Peter Zijlstra Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo LKML-Reference: <20100304140100.006500906@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 15 +++++++-------- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 585d5608ae6b..fcf1788f9626 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -170,8 +170,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static int x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx); +static int x86_perf_event_set_period(struct perf_event *event); /* * Generalized hw caching related hw_event table, filled @@ -835,7 +834,7 @@ void hw_perf_enable(void) if (hwc->idx == -1) { x86_assign_hw_event(event, cpuc, i); - x86_perf_event_set_period(event, hwc, hwc->idx); + x86_perf_event_set_period(event); } /* * need to mark as active because x86_pmu_disable() @@ -876,12 +875,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); * To be called with the event disabled in hw: */ static int -x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; - int err, ret = 0; + int err, ret = 0, idx = hwc->idx; if (idx == X86_PMC_IDX_FIXED_BTS) return 0; @@ -979,7 +978,7 @@ static int x86_pmu_start(struct perf_event *event) if (hwc->idx == -1) return -EAGAIN; - x86_perf_event_set_period(event, hwc, hwc->idx); + x86_perf_event_set_period(event); x86_pmu.enable(hwc, hwc->idx); return 0; @@ -1123,7 +1122,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) handled = 1; data.period = event->hw.last_period; - if (!x86_perf_event_set_period(event, hwc, idx)) + if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c582449163fa..6dbdf91ab342 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -699,7 +699,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event) int ret; x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); + ret = x86_perf_event_set_period(event); return ret; } -- cgit v1.2.2 From cc2ad4ba8792b9d4ff893ae3b845d2c5a6206fc9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 20:18:39 +0100 Subject: perf, x86: Remove superfluous arguments to x86_perf_event_update() The second and third argument to x86_perf_event_update() are superfluous since they are simple expressions of the first argument. Hence remove them. 
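For context, the core of x86_perf_event_update() is a width-aware delta computation; a simplified standalone sketch (ignoring the cmpxchg retry loop the real function uses to re-read the counter):

static long long counter_delta(unsigned long long prev,
			       unsigned long long now, int event_bits)
{
	int shift = 64 - event_bits;	/* e.g. 64 - 48 for 48-bit counters */
	long long delta;

	/* Shift both raw reads up so the subtraction wraps correctly for
	 * counters narrower than 64 bits, then sign-extend back down. */
	delta = (long long)(now << shift) - (long long)(prev << shift);
	return delta >> shift;
}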
Signed-off-by: Peter Zijlstra Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo LKML-Reference: <20100304140100.089468871@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 11 ++++++----- arch/x86/kernel/cpu/perf_event_intel.c | 10 ++-------- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fcf1788f9626..086127ba580f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -193,11 +193,12 @@ static u64 __read_mostly hw_cache_event_ids * Returns the delta events processed. */ static u64 -x86_perf_event_update(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_update(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; s64 delta; if (idx == X86_PMC_IDX_FIXED_BTS) @@ -1064,7 +1065,7 @@ static void x86_pmu_stop(struct perf_event *event) * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event, hwc, idx); + x86_perf_event_update(event); cpuc->events[idx] = NULL; } @@ -1112,7 +1113,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; hwc = &event->hw; - val = x86_perf_event_update(event, hwc, idx); + val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; @@ -1458,7 +1459,7 @@ void __init init_hw_perf_events(void) static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event, &event->hw, event->hw.idx); + x86_perf_event_update(event); } static const struct pmu pmu = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6dbdf91ab342..a4c9f160448e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -694,14 +694,8 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) */ static int intel_pmu_save_and_restart(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event); - - return ret; + x86_perf_event_update(event); + return x86_perf_event_set_period(event); } static void intel_pmu_reset(void) -- cgit v1.2.2 From aff3d91a913c9ae0c2f56b65b27cbd00c7d27ee3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 20:32:08 +0100 Subject: perf, x86: Change x86_pmu.{enable,disable} calling convention Pass the full perf_event into the x86_pmu functions so that those may make use of more than the hw_perf_event, and while doing this, remove the superfluous second argument. 
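The signature change in sketch form (hypothetical struct names; only the callback types are shown):

struct hw_perf_event;
struct perf_event;

struct pmu_ops_before {
	void (*enable)(struct hw_perf_event *hwc, int idx);
	void (*disable)(struct hw_perf_event *hwc, int idx);
};

struct pmu_ops_after {		/* hwc and idx derived via &event->hw */
	void (*enable)(struct perf_event *event);
	void (*disable)(struct perf_event *event);
};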
Signed-off-by: Peter Zijlstra Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo LKML-Reference: <20100304140100.165166129@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 31 +++++++++++++++---------------- arch/x86/kernel/cpu/perf_event_intel.c | 30 +++++++++++++++++------------- arch/x86/kernel/cpu/perf_event_p6.c | 10 ++++++---- 3 files changed, 38 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 086127ba580f..2dd704fa1299 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -133,8 +133,8 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_event *, int); - void (*disable)(struct hw_perf_event *, int); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -845,7 +845,7 @@ void hw_perf_enable(void) set_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = event; - x86_pmu.enable(hwc, hwc->idx); + x86_pmu.enable(event); perf_event_update_userpage(event); } cpuc->n_added = 0; @@ -858,15 +858,16 @@ void hw_perf_enable(void) x86_pmu.enable_all(); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) { - (void)checking_wrmsrl(hwc->config_base + idx, + (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); } -static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline void x86_pmu_disable_event(struct perf_event *event) { - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); + struct hw_perf_event *hwc = &event->hw; + (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -927,11 +928,11 @@ x86_perf_event_set_period(struct perf_event *event) return ret; } -static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) - __x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(&event->hw); } /* @@ -974,13 +975,11 @@ static int x86_pmu_enable(struct perf_event *event) static int x86_pmu_start(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx == -1) + if (event->hw.idx == -1) return -EAGAIN; x86_perf_event_set_period(event); - x86_pmu.enable(hwc, hwc->idx); + x86_pmu.enable(event); return 0; } @@ -994,7 +993,7 @@ static void x86_pmu_unthrottle(struct perf_event *event) cpuc->events[hwc->idx] != event)) return; - x86_pmu.enable(hwc, hwc->idx); + x86_pmu.enable(event); } void perf_event_print_debug(void) @@ -1059,7 +1058,7 @@ static void x86_pmu_stop(struct perf_event *event) * could reenable again: */ clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); + x86_pmu.disable(event); /* * Drain the remaining delta count out of a event @@ -1127,7 +1126,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu.disable(hwc, idx); + x86_pmu.disable(event); } if (handled) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a4c9f160448e..a84094897799 
100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -548,9 +548,9 @@ static inline void intel_pmu_ack_status(u64 ack) } static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = __idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -621,26 +621,28 @@ static void intel_pmu_drain_bts_buffer(void) } static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +intel_pmu_disable_event(struct perf_event *event) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; } if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); + intel_pmu_disable_fixed(hwc); return; } - x86_pmu_disable_event(hwc, idx); + x86_pmu_disable_event(event); } static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = __idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; int err; @@ -670,9 +672,11 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void intel_pmu_enable_event(struct perf_event *event) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { if (!__get_cpu_var(cpu_hw_events).enabled) return; @@ -681,11 +685,11 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) } if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); + intel_pmu_enable_fixed(hwc); return; } - __x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(hwc); } /* @@ -771,7 +775,7 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); + intel_pmu_disable_event(event); } intel_pmu_ack_status(ack); diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index a4e67b99d91c..a330485d14da 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -77,27 +77,29 @@ static void p6_pmu_enable_all(void) } static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +p6_pmu_disable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + idx, val); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); } -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void p6_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + idx, val); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); } static __initconst struct x86_pmu p6_pmu = { -- cgit v1.2.2 From 34538ee77b39a12702e0f4c3ed9e8fa2dd5eb92c Mon Sep 17 
00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 21:16:55 +0100 Subject: perf, x86: Use unlocked bitops There is no concurrency on these variables, so don't use LOCK'ed ops. As to the intel_pmu_handle_irq() status bit clean, nobody uses that so remove it all together. Signed-off-by: Peter Zijlstra Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo LKML-Reference: <20100304140100.240023029@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 8 ++++---- arch/x86/kernel/cpu/perf_event_amd.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2dd704fa1299..01b166737424 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -643,7 +643,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (test_bit(hwc->idx, used_mask)) break; - set_bit(hwc->idx, used_mask); + __set_bit(hwc->idx, used_mask); if (assign) assign[i] = hwc->idx; } @@ -692,7 +692,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; - set_bit(j, used_mask); + __set_bit(j, used_mask); if (assign) assign[i] = j; @@ -842,7 +842,7 @@ void hw_perf_enable(void) * clear active_mask and events[] yet it preserves * idx */ - set_bit(hwc->idx, cpuc->active_mask); + __set_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = event; x86_pmu.enable(event); @@ -1057,7 +1057,7 @@ static void x86_pmu_stop(struct perf_event *event) * Must be done before we disable, otherwise the nmi handler * could reenable again: */ - clear_bit(idx, cpuc->active_mask); + __clear_bit(idx, cpuc->active_mask); x86_pmu.disable(event); /* diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 014528ba7d57..573458f1caf2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -287,7 +287,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) * initialize all possible NB constraints */ for (i = 0; i < x86_pmu.num_events; i++) { - set_bit(i, nb->event_constraints[i].idxmsk); + __set_bit(i, nb->event_constraints[i].idxmsk); nb->event_constraints[i].weight = 1; } return nb; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a84094897799..d87421c3f55b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -765,7 +765,6 @@ again: for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; - clear_bit(bit, (unsigned long *) &status); if (!test_bit(bit, cpuc->active_mask)) continue; -- cgit v1.2.2 From c08053e627d23490a03431285b78b7a5b617fbad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 Mar 2010 13:19:24 +0100 Subject: perf, x86: Fix x86_pmu_start pmu::start should undo pmu::stop, make it so. 
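The restored pairing, condensed from the hunks below (an illustrative summary, not additional code in the patch):

/*
 *   x86_pmu_start: x86_perf_event_set_period(); cpuc->events[idx] = event;
 *                  __set_bit(idx, cpuc->active_mask); x86_pmu.enable()
 *   x86_pmu_stop:  __clear_bit(idx, cpuc->active_mask); x86_pmu.disable();
 *                  x86_perf_event_update(); cpuc->events[idx] = NULL
 *
 * With the bookkeeping inside start(), hw_perf_enable() no longer has to
 * open-code it, and stop() followed by start() round-trips cleanly.
 */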
Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 01b166737424..9757b96f15f5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -785,6 +785,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } +static int x86_pmu_start(struct perf_event *event); static void x86_pmu_stop(struct perf_event *event); void hw_perf_enable(void) @@ -833,20 +834,10 @@ void hw_perf_enable(void) event = cpuc->event_list[i]; hwc = &event->hw; - if (hwc->idx == -1) { + if (hwc->idx == -1) x86_assign_hw_event(event, cpuc, i); - x86_perf_event_set_period(event); - } - /* - * need to mark as active because x86_pmu_disable() - * clear active_mask and events[] yet it preserves - * idx - */ - __set_bit(hwc->idx, cpuc->active_mask); - cpuc->events[hwc->idx] = event; - x86_pmu.enable(event); - perf_event_update_userpage(event); + x86_pmu_start(event); } cpuc->n_added = 0; perf_events_lapic_init(); @@ -975,11 +966,17 @@ static int x86_pmu_enable(struct perf_event *event) static int x86_pmu_start(struct perf_event *event) { - if (event->hw.idx == -1) + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx = event->hw.idx; + + if (idx == -1) return -EAGAIN; x86_perf_event_set_period(event); + cpuc->events[idx] = event; + __set_bit(idx, cpuc->active_mask); x86_pmu.enable(event); + perf_event_update_userpage(event); return 0; } -- cgit v1.2.2 From 71e2d2828046133ed985696a02e2e1499ca0bfb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Mar 2010 17:51:33 +0100 Subject: perf, x86: Avoid double disable on throttle vs ioctl(PERF_IOC_DISABLE) Calling ioctl(PERF_EVENT_IOC_DISABLE) on a throttled counter would result in a double disable; cure this by using x86_pmu_{start,stop} for throttle/unthrottle and teach x86_pmu_stop() to check ->active_mask.
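The idempotence trick reduces to a test-and-clear; a minimal standalone sketch (plain C, with a bool standing in for the event's bit in ->active_mask):

#include <stdbool.h>

static bool active;	/* the event's bit in cpuc->active_mask */

static void stop_once(void)
{
	bool was_active = active;	/* __test_and_clear_bit() in the patch */

	active = false;
	if (!was_active)
		return;		/* second caller finds the bit already clear */

	/* disable the hardware counter and drain the final count here */
}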
Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 20 ++++++-------------- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9757b96f15f5..b68c4fb7a944 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -983,14 +983,8 @@ static int x86_pmu_start(struct perf_event *event) static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->events[hwc->idx] != event)) - return; - - x86_pmu.enable(event); + int ret = x86_pmu_start(event); + WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1050,11 +1044,9 @@ static void x86_pmu_stop(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - __clear_bit(idx, cpuc->active_mask); + if (!__test_and_clear_bit(idx, cpuc->active_mask)) + return; + x86_pmu.disable(event); /* @@ -1123,7 +1115,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu.disable(event); + x86_pmu_stop(event); } if (handled) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d87421c3f55b..84bfde64a337 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -774,7 +774,7 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(event); + x86_pmu_stop(event); } intel_pmu_ack_status(ack); -- cgit v1.2.2 From 356e1f2e0ace2d4b100c8eda9d49b709e8323da5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 Mar 2010 13:49:56 +0100 Subject: perf, x86: Properly account n_added Make sure n_added is properly accounted so that we can rely on the value to reflect the number of added counters. This is needed if it's going to be used for more than a boolean check.
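A tiny worked example of the accounting difference (standalone C; the numbers are hypothetical):

#include <assert.h>

int main(void)
{
	int n_events = 2, n_added = 0;

	n_added += 3 - n_events; n_events = 3;	/* event A added */
	n_added += 4 - n_events; n_events = 4;	/* event B added before hw_perf_enable() ran */

	assert(n_added == 2);	/* with plain '=', A's addition would be lost */
	return 0;
}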
Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b68c4fb7a944..071c8405debd 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -959,7 +959,7 @@ static int x86_pmu_enable(struct perf_event *event) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->n_events = n; - cpuc->n_added = n - n0; + cpuc->n_added += n - n0; return 0; } @@ -1302,7 +1302,7 @@ int hw_perf_group_sched_in(struct perf_event *leader, memcpy(cpuc->assign, assign, n0*sizeof(int)); cpuc->n_events = n0; - cpuc->n_added = n1; + cpuc->n_added += n1; ctx->nr_active += n1; /* -- cgit v1.2.2 From 19925ce778f9fc371b9607625de3bff04c60121e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 Mar 2010 13:20:40 +0100 Subject: perf, x86: Fix double disable calls hw_perf_enable() would disable events that were not yet enabled. This causes problems with code that assumes that ->enable/->disable calls are balanced (like the LBR code does). What happens is that we disable newly added counters that match their previous assignment, even though they are not yet programmed on the hardware. Avoid this by only doing the first pass over the existing events. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 071c8405debd..045cc0bb4c17 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -802,6 +802,7 @@ void hw_perf_enable(void) return; if (cpuc->n_added) { + int n_running = cpuc->n_events - cpuc->n_added; /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() @@ -809,7 +810,7 @@ void hw_perf_enable(void) * step1: save events moving to new counters * step2: reprogram moved events into new counters */ - for (i = 0; i < cpuc->n_events; i++) { + for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; -- cgit v1.2.2 From f3d46b2e6fa57547f9884330798792afc83f4b04 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 Mar 2010 13:24:58 +0100 Subject: perf, x86: Fix double enable calls hw_perf_enable() would enable already enabled events. This causes problems with code that assumes that ->enable/->disable calls are balanced (like the LBR code does). What happens is that events that were already running and left in place would get enabled again. Avoid this by only enabling new events that match their previous assignment. 
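The notion of "previous assignment" used here is the match_prev_assignment() predicate already present in perf_event.c, roughly (paraphrased; see the file for the exact body):

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc, int i)
{
	return hwc->idx == cpuc->assign[i] &&
	       hwc->last_cpu == smp_processor_id() &&
	       hwc->last_tag == cpuc->tags[i];
}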
Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 045cc0bb4c17..1d665a0b202c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -835,6 +835,10 @@ void hw_perf_enable(void) event = cpuc->event_list[i]; hwc = &event->hw; + if (i < n_running && + match_prev_assignment(hwc, cpuc, i)) + continue; + if (hwc->idx == -1) x86_assign_hw_event(event, cpuc, i); -- cgit v1.2.2 From 61e67fb9d3ed13e6a7f58652ae4979b9c872fa57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Mar 2010 07:38:37 +0100 Subject: perf/x86-64: Use frame pointer to walk on irq and process stacks We were using the frame pointer based stack walker on every contexts in x86-32, but not in x86-64 where we only use the seven-league boots on the exception stacks. Use it also on irq and process stacks. This utterly accelerate the captures. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo --- arch/x86/kernel/dumpstack_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d5e2a2ebb627..272c9f1f05f3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -208,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; - bp = print_context_stack(tinfo, stack, bp, + bp = ops->walk_stack(tinfo, stack, bp, ops, data, irq_stack_end, &graph); /* * We link to the next stack (which would be @@ -229,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -- cgit v1.2.2 From 5331d7b84613b8325362dde53dc2bff2fb87d351 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Mar 2010 21:15:56 +0100 Subject: perf: Introduce new perf_fetch_caller_regs() for hot regs snapshot Events that trigger overflows by interrupting a context can use get_irq_regs() or task_pt_regs() to retrieve the state when the event triggered. But this is not the case for some other class of events like trace events as tracepoints are executed in the same context than the code that triggered the event. It means we need a different api to capture the regs there, namely we need a hot snapshot to get the most important informations for perf: the instruction pointer to get the event origin, the frame pointer for the callchain, the code segment for user_mode() tests (we always use __KERNEL_CS as trace events always occur from the kernel) and the eflags for further purposes. v2: rename perf_save_regs to perf_fetch_caller_regs as per Masami's suggestion. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. 
Peter Anvin Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Jason Baron Cc: Archs --- arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++++ arch/x86/kernel/dumpstack.h | 15 +++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1d665a0b202c..c6bde7d7afdc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1707,3 +1707,15 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } + +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) +{ + regs->ip = ip; + /* + * perf_arch_fetch_caller_regs adds another call, we need to increment + * the skip level + */ + regs->bp = rewind_frame_pointer(skip + 1); + regs->cs = __KERNEL_CS; + local_save_flags(regs->flags); +} diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 4fd1420faffa..29e5f7c845b2 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,4 +29,19 @@ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; + +static inline unsigned long rewind_frame_pointer(int n) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + while (n--) + frame = frame->next_frame; #endif + + return (unsigned long)frame; +} + +#endif /* DUMPSTACK_H */ -- cgit v1.2.2 From f56e8a0765cc4374e02f4e3a79e2427b5096b075 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Mar 2010 15:03:27 -0800 Subject: x86/mce: Fix RCU lockdep splats Create an rcu_dereference_check_mce() that checks for RCU-sched read side and mce_read_mutex being held on update side. Replace uses of rcu_dereference() in arch/x86/kernel/cpu/mcheck/mce.c with this new macro. Signed-off-by: Paul E. McKenney Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267830207-9474-3-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a8aacd4b513c..4442e9e898c2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,11 @@ #include "mce-internal.h" +#define rcu_dereference_check_mce(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&mce_read_mutex)) + #define CREATE_TRACE_POINTS #include @@ -158,7 +163,7 @@ void mce_log(struct mce *mce) mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference(mcelog.next); + entry = rcu_dereference_check_mce(mcelog.next); for (;;) { /* * When the buffer fills up discard new entries. 
@@ -1500,7 +1505,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, return -ENOMEM; mutex_lock(&mce_read_mutex); - next = rcu_dereference(mcelog.next); + next = rcu_dereference_check_mce(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { @@ -1565,7 +1570,7 @@ timeout: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) + if (rcu_dereference_check_mce(mcelog.next)) return POLLIN | POLLRDNORM; return 0; } -- cgit v1.2.2 From 10fb7f1f2d311b4d2e5d881fe2d83f1c281100f9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 5 Mar 2010 13:10:36 -0600 Subject: x86: Reduce per cpu MCA boot up messages Don't write per cpu MCA boot up messages. Signed-off-by: Mike Travis Cc: Hidetoshi Seto Cc: x86@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 7c785634af2b..d15df6e49bf0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,7 +95,7 @@ static void cmci_discover(int banks, int boot) /* Already owned by someone else? */ if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) + if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } @@ -107,7 +107,7 @@ static void cmci_discover(int banks, int boot) /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) + if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); } else { -- cgit v1.2.2 From d6dd692168c049196f54edc2e8227c60702bb1d2 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 5 Mar 2010 13:10:38 -0600 Subject: x86: Reduce per cpu warning boot up messages Reduce warning message output to one line only instead of per cpu.
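The printk_once() used for this is defined in include/linux/kernel.h of this era; it expands to a static flag guarding the printk, roughly:

#define printk_once(x...) ({			\
	static bool __print_once;		\
						\
	if (!__print_once) {			\
		__print_once = true;		\
		printk(x);			\
	}					\
})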
Signed-off-by: Mike Travis Cc: Rusty Russell Cc: Frederic Weisbecker Cc: Brian Gerst Cc: x86@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c9b3522b6b46..4e8cb4ee9fcb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -600,7 +600,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk(KERN_WARNING "WARNING: polling idle and HT enabled," + printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," " performance may degrade.\n"); } #endif -- cgit v1.2.2 From 45e16a6834b6af098702e5ea6c9a40de42ff77d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Mar 2010 13:40:30 +0100 Subject: perf, x86: Fix hw_perf_enable() event assignment What happens is that we schedule badly like: <...>-1987 [019] 280.252808: x86_pmu_start: event-46/1300c0: idx: 0 <...>-1987 [019] 280.252811: x86_pmu_start: event-47/1300c0: idx: 1 <...>-1987 [019] 280.252812: x86_pmu_start: event-48/1300c0: idx: 2 <...>-1987 [019] 280.252813: x86_pmu_start: event-49/1300c0: idx: 3 <...>-1987 [019] 280.252814: x86_pmu_start: event-50/1300c0: idx: 32 <...>-1987 [019] 280.252825: x86_pmu_stop: event-46/1300c0: idx: 0 <...>-1987 [019] 280.252826: x86_pmu_stop: event-47/1300c0: idx: 1 <...>-1987 [019] 280.252827: x86_pmu_stop: event-48/1300c0: idx: 2 <...>-1987 [019] 280.252828: x86_pmu_stop: event-49/1300c0: idx: 3 <...>-1987 [019] 280.252829: x86_pmu_stop: event-50/1300c0: idx: 32 <...>-1987 [019] 280.252834: x86_pmu_start: event-47/1300c0: idx: 1 <...>-1987 [019] 280.252834: x86_pmu_start: event-48/1300c0: idx: 2 <...>-1987 [019] 280.252835: x86_pmu_start: event-49/1300c0: idx: 3 <...>-1987 [019] 280.252836: x86_pmu_start: event-50/1300c0: idx: 32 <...>-1987 [019] 280.252837: x86_pmu_start: event-51/1300c0: idx: 32 *FAIL* This happens because we only iterate the n_running events in the first pass, and reset their index to -1 if they don't match to force a re-assignment. Now, in our RR example, n_running == 0 because we fully unscheduled, so event-50 will retain its idx==32, even though in scheduling it will have gotten idx=0, and we don't trigger the re-assign path. The easiest way to fix this is the below patch, which simply validates the full assignment in the second pass.
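In condensed form, the repaired reprogramming logic reads as follows (a sketch using the helper names from the diff below; locking and surrounding context are elided):

	/* step 1: stop events that no longer sit on the right counter */
	for (i = 0; i < n_running; i++) {
		event = cpuc->event_list[i];
		hwc = &event->hw;

		if (hwc->idx == -1 || match_prev_assignment(hwc, cpuc, i))
			continue;

		x86_pmu_stop(event);
	}

	/*
	 * step 2: validate the assignment of every event, not just the
	 * first n_running ones, so a stale hwc->idx can never survive
	 */
	for (i = 0; i < cpuc->n_events; i++) {
		event = cpuc->event_list[i];
		hwc = &event->hw;

		if (!match_prev_assignment(hwc, cpuc, i))
			x86_assign_hw_event(event, cpuc, i);
		else if (i < n_running)
			continue;	/* already running on the right counter */

		x86_pmu_start(event);
	}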
Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1268311069.5037.31.camel@laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c6bde7d7afdc..5fb490c6ee5c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -811,7 +811,6 @@ void hw_perf_enable(void) * step2: reprogram moved events into new counters */ for (i = 0; i < n_running; i++) { - event = cpuc->event_list[i]; hwc = &event->hw; @@ -826,21 +825,16 @@ void hw_perf_enable(void) continue; x86_pmu_stop(event); - - hwc->idx = -1; } for (i = 0; i < cpuc->n_events; i++) { - event = cpuc->event_list[i]; hwc = &event->hw; - if (i < n_running && - match_prev_assignment(hwc, cpuc, i)) - continue; - - if (hwc->idx == -1) + if (!match_prev_assignment(hwc, cpuc, i)) x86_assign_hw_event(event, cpuc, i); + else if (i < n_running) + continue; x86_pmu_start(event); } -- cgit v1.2.2 From 639fe4b12f92b54c9c3b38c82cdafaa38cfd3e63 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 11 Mar 2010 15:30:35 +0800 Subject: perf: export perf_trace_regs and perf_arch_fetch_caller_regs Export perf_trace_regs and perf_arch_fetch_caller_regs since module will use these. Signed-off-by: Xiao Guangrong [ use EXPORT_PER_CPU_SYMBOL_GPL() ] Signed-off-by: Peter Zijlstra LKML-Reference: <4B989C1B.2090407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5fb490c6ee5c..7645faea8e85 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1713,3 +1713,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } +EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); -- cgit v1.2.2 From 8447b360a3897bdfb0677107564d1dd9ab6e63be Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 11 Mar 2010 12:43:29 -0600 Subject: x86, UV: Fix target_cpus() in x2apic_uv_x.c target_cpu() should initially target all cpus, not just cpu 0. Otherwise systems with lots of disks can exhaust the interrupt vectors on cpu 0 if a large number of disks are discovered before the irq balancer is running. Note: UV code only... Signed-off-by: Jack Steiner LKML-Reference: <20100311184328.GA21433@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 21db3cbea7dc..af0ca80e38a9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -114,11 +114,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. 
*/ - static const struct cpumask *uv_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) -- cgit v1.2.2 From 5d0e52830e9ae09b872567f4aca3dfb5b5918079 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:13 -0800 Subject: Add generic sys_old_select() Add a generic implementation of the old select() syscall, which expects its argument in a memory block and switch all architectures over to use it. Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Reviewed-by: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Acked-by: Andreas Schwab Acked-by: Russell King Acked-by: Greg Ungerer Acked-by: David Howells Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/ia32/sys_ia32.c | 18 ------------------ arch/x86/include/asm/sys_ia32.h | 2 -- arch/x86/include/asm/syscalls.h | 2 -- arch/x86/include/asm/unistd_32.h | 1 + arch/x86/kernel/sys_i386_32.c | 17 ----------------- arch/x86/kernel/syscall_table_32.S | 2 +- 7 files changed, 3 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 53147ad85b96..34f821802c23 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -586,7 +586,7 @@ ia32_sys_call_table: .quad compat_sys_settimeofday .quad sys_getgroups16 /* 80 */ .quad sys_setgroups16 - .quad sys32_old_select + .quad compat_sys_old_select .quad sys_symlink .quad sys_lstat .quad sys_readlink /* 85 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 422572c77923..cb80816e7a16 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -332,24 +332,6 @@ asmlinkage long sys32_alarm(unsigned int seconds) return alarm_setitimer(seconds); } -struct sel_arg_struct { - unsigned int n; - unsigned int inp; - unsigned int outp; - unsigned int exp; - unsigned int tvp; -}; - -asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg) -{ - struct sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); -} - asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) { diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index d5f69045c100..b26fc750e416 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -40,8 +40,6 @@ asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *, compat_sigset_t __user *, unsigned int); asmlinkage long sys32_alarm(unsigned int); -struct sel_arg_struct; -asmlinkage long sys32_old_select(struct sel_arg_struct __user *); asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); asmlinkage long sys32_sysfs(int, u32, u32); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 8868b9420b0e..8406d06c118d 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -52,12 +52,10 @@ unsigned long sys_sigreturn(struct pt_regs *); /* kernel/sys_i386_32.c */ struct mmap_arg_struct; -struct sel_arg_struct; struct oldold_utsname; struct old_utsname; asmlinkage int old_mmap(struct mmap_arg_struct __user *); -asmlinkage 
int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); asmlinkage int sys_uname(struct old_utsname __user *); asmlinkage int sys_olduname(struct oldold_utsname __user *); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 3baf379fa840..4eb2667b54ae 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -366,6 +366,7 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_SELECT #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index dee1ff7cba58..345dbd19a2b3 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -58,23 +58,6 @@ out: return err; } - -struct sel_arg_struct { - unsigned long n; - fd_set __user *inp, *outp, *exp; - struct timeval __user *tvp; -}; - -asmlinkage int old_select(struct sel_arg_struct __user *arg) -{ - struct sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - /* sys_select() does the appropriate kernel locking */ - return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); -} - /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. * diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 15228b5d3eb7..4d10abacecdb 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -81,7 +81,7 @@ ENTRY(sys_call_table) .long sys_settimeofday .long sys_getgroups16 /* 80 */ .long sys_setgroups16 - .long old_select + .long sys_old_select .long sys_symlink .long sys_lstat .long sys_readlink /* 85 */ -- cgit v1.2.2 From a4679373cf4ee0e7792dc56205365732b725c2c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:15 -0800 Subject: Add generic sys_old_mmap() Add a generic implementation of the old mmap() syscall, which expects its argument in a memory block and switch all architectures over to use it. Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Reviewed-by: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Acked-by: Jesper Nilsson Acked-by: Russell King Acked-by: Greg Ungerer Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/sys_ia32.c | 6 +++--- arch/x86/include/asm/sys_ia32.h | 4 ++-- arch/x86/include/asm/syscalls.h | 2 -- arch/x86/include/asm/unistd_32.h | 1 + arch/x86/kernel/sys_i386_32.c | 34 ---------------------------------- arch/x86/kernel/syscall_table_32.S | 2 +- 6 files changed, 7 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index cb80816e7a16..56c99f46e289 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -143,7 +143,7 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, * block for parameter passing.. 
*/ -struct mmap_arg_struct { +struct mmap_arg_struct32 { unsigned int addr; unsigned int len; unsigned int prot; @@ -152,9 +152,9 @@ struct mmap_arg_struct { unsigned int offset; }; -asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) +asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg) { - struct mmap_arg_struct a; + struct mmap_arg_struct32 a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index b26fc750e416..7d348d803669 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -26,8 +26,8 @@ asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *); asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); asmlinkage long sys32_fstatat(unsigned int, char __user *, struct stat64 __user *, int); -struct mmap_arg_struct; -asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); +struct mmap_arg_struct32; +asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *); asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); struct sigaction32; diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 8406d06c118d..86ab6a0623fd 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -51,11 +51,9 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, unsigned long sys_sigreturn(struct pt_regs *); /* kernel/sys_i386_32.c */ -struct mmap_arg_struct; struct oldold_utsname; struct old_utsname; -asmlinkage int old_mmap(struct mmap_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); asmlinkage int sys_uname(struct old_utsname __user *); asmlinkage int sys_olduname(struct oldold_utsname __user *); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 4eb2667b54ae..daa65d9aae95 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -366,6 +366,7 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_MMAP #define __ARCH_WANT_SYS_OLD_SELECT #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 345dbd19a2b3..7955e90c8341 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,40 +24,6 @@ #include -/* - * Perform the select(nd, in, out, ex, tv) and mmap() system - * calls. Linux/i386 didn't use to be able to handle more than - * 4 system call parameters, so these system calls used a memory - * block for parameter passing.. - */ - -struct mmap_arg_struct { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; -}; - -asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) -{ - struct mmap_arg_struct a; - int err = -EFAULT; - - if (copy_from_user(&a, arg, sizeof(a))) - goto out; - - err = -EINVAL; - if (a.offset & ~PAGE_MASK) - goto out; - - err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, - a.fd, a.offset >> PAGE_SHIFT); -out: - return err; -} - /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. 
* diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4d10abacecdb..8b3729341216 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -89,7 +89,7 @@ ENTRY(sys_call_table) .long sys_swapon .long sys_reboot .long sys_old_readdir - .long old_mmap /* 90 */ + .long sys_old_mmap /* 90 */ .long sys_munmap .long sys_truncate .long sys_ftruncate -- cgit v1.2.2 From baed7fc9b580bd3fb8252ff1d9b36eaf1f86b670 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:18 -0800 Subject: Add generic sys_ipc wrapper Add a generic implementation of the ipc demultiplexer syscall. Except for s390 and sparc64 all implementations of the sys_ipc are nearly identical. There are slight differences in the types of the parameters, where mips and powerpc as the only 64-bit architectures with sys_ipc use unsigned long for the "third" argument as it gets cast to a pointer later, while it traditionally is an "int" like most other parameters. frv goes even further and uses unsigned long for all parameters except for "ptr" which is a pointer type everywhere. The change from int to unsigned long for "third" and back to "int" for the others on frv should be fine due to the in-register calling conventions for syscalls (we already had a similar issue with the generic sys_ptrace), but I'd prefer to have the arch maintainers look over this in detail. Except for that, h8300, m68k and m68knommu lack an implementation of the semtimedop sub call which this patch adds, and various architectures have gets used - at least on i386 it seems superfluous as the compat code on x86-64 and ia64 doesn't even bother to implement it. [akpm@linux-foundation.org: add sys_ipc to sys_ni.c] Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Reviewed-by: H.
Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Acked-by: Jesper Nilsson Acked-by: Russell King Acked-by: David Howells Acked-by: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/syscalls.h | 1 - arch/x86/include/asm/unistd_32.h | 1 + arch/x86/kernel/sys_i386_32.c | 85 ---------------------------------------- 3 files changed, 1 insertion(+), 86 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 86ab6a0623fd..50f6a569f0d1 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -54,7 +54,6 @@ unsigned long sys_sigreturn(struct pt_regs *); struct oldold_utsname; struct old_utsname; -asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); asmlinkage int sys_uname(struct old_utsname __user *); asmlinkage int sys_olduname(struct oldold_utsname __user *); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index daa65d9aae95..45e64a17b86e 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -354,6 +354,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 7955e90c8341..8b5c348fdcf2 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,91 +24,6 @@ #include -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc(uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); - - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl(first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd(first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof(tmp))) - return -EFAULT; - return sys_msgrcv(first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv(first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget((key_t) first, second); - case MSGCTL: - return sys_msgctl(first, second, (struct msqid_ds __user *) ptr); - - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat(first, (char __user *) ptr, second, &raddr); - if (ret) - return ret; - return put_user(raddr, (ulong __user *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ - return do_shmat(first, (char __user *) ptr, second, (ulong *) third); - } - case 
SHMDT: - return sys_shmdt((char __user *)ptr); - case SHMGET: - return sys_shmget(first, second, third); - case SHMCTL: - return sys_shmctl(first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} - /* * Old cruft */ -- cgit v1.2.2 From e28cbf22933d0c0ccaf3c4c27a1a263b41f73859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:19 -0800 Subject: improve sys_newuname() for compat architectures On an architecture that supports 32-bit compat we need to override the reported machine in uname with the 32-bit value. Instead of doing this separately in every architecture introduce a COMPAT_UTS_MACHINE define in and apply it directly in sys_newuname(). Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/compat.h | 3 ++- arch/x86/include/asm/syscalls.h | 3 --- arch/x86/include/asm/unistd_64.h | 2 +- arch/x86/kernel/sys_x86_64.c | 12 ------------ 4 files changed, 3 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 9a9c7bdc923d..306160e58b48 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -8,7 +8,8 @@ #include #include -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "i686\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 50f6a569f0d1..47cd606c3537 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -68,11 +68,8 @@ int sys_vm86(unsigned long, unsigned long, struct pt_regs *); long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ -struct new_utsname; - asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -asmlinkage long sys_uname(struct new_utsname __user *); #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALLS_H */ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 4843f7ba754a..83e2d6dc5038 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -146,7 +146,7 @@ __SYSCALL(__NR_wait4, sys_wait4) #define __NR_kill 62 __SYSCALL(__NR_kill, sys_kill) #define __NR_uname 63 -__SYSCALL(__NR_uname, sys_uname) +__SYSCALL(__NR_uname, sys_newuname) #define __NR_semget 64 __SYSCALL(__NR_semget, sys_semget) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8aa2057efd12..ff14a5044ce6 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -209,15 +209,3 @@ bottomup: return addr; } - - -SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) -{ - int err; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - return err ? -EFAULT : 0; -} -- cgit v1.2.2 From 5cacdb4add1b1e50fe75edc50ebbb7bddd9cf5e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:21 -0800 Subject: Add generic sys_olduname() Add generic implementations of the old and really old uname system calls. 
Note that sh only implements sys_olduname but not sys_oldolduname, but I'm not going to bother with another ifdef for that special case. m32r implemented an old uname but never wired it up, so kill it, too. Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 4 ++-- arch/x86/ia32/sys_ia32.c | 52 ---------------------------------------- arch/x86/include/asm/sys_ia32.h | 5 ---- arch/x86/include/asm/syscalls.h | 7 ------ arch/x86/include/asm/unistd_32.h | 1 + arch/x86/include/asm/unistd_64.h | 1 + arch/x86/kernel/sys_i386_32.c | 49 ------------------------------------- 7 files changed, 4 insertions(+), 115 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 34f821802c23..59b4556a5b92 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -563,7 +563,7 @@ ia32_sys_call_table: .quad quiet_ni_syscall /* old mpx syscall holder */ .quad sys_setpgid .quad quiet_ni_syscall /* old ulimit syscall holder */ - .quad sys32_olduname + .quad sys_olduname .quad sys_umask /* 60 */ .quad sys_chroot .quad compat_sys_ustat @@ -613,7 +613,7 @@ ia32_sys_call_table: .quad compat_sys_newstat .quad compat_sys_newlstat .quad compat_sys_newfstat - .quad sys32_uname + .quad sys_uname .quad stub32_iopl /* 110 */ .quad sys_vhangup .quad quiet_ni_syscall /* old "idle" system call */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 56c99f46e289..74c35431b7d8 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -448,58 +448,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_olduname(struct oldold_utsname __user *name) -{ - char *arch = "x86_64"; - int err; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - err = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - err |= __put_user(0, name->sysname+__OLD_UTS_LEN); - err |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - err |= __put_user(0, name->nodename+__OLD_UTS_LEN); - err |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - err |= __put_user(0, name->release+__OLD_UTS_LEN); - err |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - err |= __put_user(0, name->version+__OLD_UTS_LEN); - - if (personality(current->personality) == PER_LINUX32) - arch = "i686"; - - err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1); - - up_read(&uts_sem); - - err = err ? -EFAULT : 0; - - return err; -} - -long sys32_uname(struct old_utsname __user *name) -{ - int err; - - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - - return err ? 
-EFAULT : 0; -} - asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 7d348d803669..3ad421784ae7 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -54,11 +54,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -struct oldold_utsname; -struct old_utsname; -asmlinkage long sys32_olduname(struct oldold_utsname __user *); -long sys32_uname(struct old_utsname __user *); - asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, compat_uptr_t __user *, struct pt_regs *); asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 47cd606c3537..5c044b43e9a7 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -50,13 +50,6 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); unsigned long sys_sigreturn(struct pt_regs *); -/* kernel/sys_i386_32.c */ -struct oldold_utsname; -struct old_utsname; - -asmlinkage int sys_uname(struct old_utsname __user *); -asmlinkage int sys_olduname(struct oldold_utsname __user *); - /* kernel/vm86_32.c */ int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); int sys_vm86(unsigned long, unsigned long, struct pt_regs *); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 45e64a17b86e..beb9b5f8f8a4 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -367,6 +367,7 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLD_MMAP #define __ARCH_WANT_SYS_OLD_SELECT #define __ARCH_WANT_SYS_OLDUMOUNT diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 83e2d6dc5038..ff4307b0e81e 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -680,6 +680,7 @@ __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 8b5c348fdcf2..196552bb412c 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,55 +24,6 @@ #include -/* - * Old cruft - */ -asmlinkage int sys_uname(struct old_utsname __user *name) -{ - int err; - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - return err? 
-EFAULT:0; -} - -asmlinkage int sys_olduname(struct oldold_utsname __user *name) -{ - int error; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); - - up_read(&uts_sem); - - error = error ? -EFAULT : 0; - - return error; -} - - /* * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. -- cgit v1.2.2 From dacbe41f776db0a5a9aee1e41594f405c95778a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:22:46 -0800 Subject: ptrace: move user_enable_single_step & co prototypes to linux/ptrace.h While in theory user_enable_single_step/user_disable_single_step/ user_enable_block_step could also be provided as an inline or macro there's no good reason to do so, and having the prototypes in one place keeps code size and confusion down. Roland said: The original thought there was that user_enable_single_step() et al might well be only an instruction or three on a sane machine (as if we have any of those!), and since there is only one call site inlining would be beneficial. But I agree that there is no strong reason to care about inlining it. As to the arch changes, there is only one thought I'd add to the record. It was always my thinking that for an arch where PTRACE_SINGLESTEP does text-modifying breakpoint insertion, user_enable_single_step() should not be provided. That is, arch_has_single_step()=>true means that there is an arch facility with "pure" semantics that does not have any unexpected side effects. Inserting a breakpoint might do very unexpected strange things in multi-threaded situations. Aside from that, it is a peculiar side effect that user_{enable,disable}_single_step() should cause COW de-sharing of text pages and so forth. For PTRACE_SINGLESTEP, all these peculiarities are the status quo ante for that arch, so having arch_ptrace() itself do those is one thing. But for building other things in the future, it is nicer to have a uniform "pure" semantics that arch-independent code can expect. OTOH, all such arch issues are really up to the arch maintainer. As of today, there is nothing but ptrace using user_enable_single_step() et al so it's a distinction without a practical difference. If/when there are other facilities that use user_enable_single_step() and might care, the affected arch's can revisit the question when someone cares about the quality of the arch support for said new facility.
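After the move, <linux/ptrace.h> carries the declarations in roughly this shape (a sketch of the header's structure from memory, not the verbatim text; the stubs cover architectures that lack the facility):

#ifdef arch_has_single_step
extern void user_enable_single_step(struct task_struct *);
extern void user_disable_single_step(struct task_struct *);
#else
#define arch_has_single_step()	(0)
static inline void user_enable_single_step(struct task_struct *task)
{
	BUG();		/* no "pure" single-step facility on this arch */
}
static inline void user_disable_single_step(struct task_struct *task)
{
}
#endif	/* arch_has_single_step */

#ifdef arch_has_block_step
extern void user_enable_block_step(struct task_struct *);
#else
#define arch_has_block_step()	(0)
static inline void user_enable_block_step(struct task_struct *task)
{
	BUG();
}
#endif	/* arch_has_block_step */

An arch that provides the feature defines arch_has_single_step() to a non-zero expression (x86 does, see the asm/ptrace.h hunk in the diff below) and supplies the extern functions.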
Signed-off-by: Christoph Hellwig Cc: Oleg Nesterov Cc: Roland McGrath Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/ptrace.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 20102808b191..69a686a7dff0 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -274,14 +274,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, return 0; } -/* - * These are defined as per linux/ptrace.h, which see. - */ #define arch_has_single_step() (1) -extern void user_enable_single_step(struct task_struct *); -extern void user_disable_single_step(struct task_struct *); - -extern void user_enable_block_step(struct task_struct *); #ifdef CONFIG_X86_DEBUGCTLMSR #define arch_has_block_step() (1) #else -- cgit v1.2.2 From 3bc4e4590de89c2dfcfb1000344cd072574c9ad4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 10 Mar 2010 15:23:22 -0800 Subject: pci-dma: x86: use include/linux/pci-dma.h Signed-off-by: FUJITA Tomonori Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 3 +++ arch/x86/include/asm/pci.h | 30 ++---------------------------- 2 files changed, 5 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e98440371525..93936de67796 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,9 @@ config ZONE_DMA config SBUS bool +config NEED_DMA_MAP_STATE + def_bool (X86_64 || DMAR || DMA_API_DEBUG) + config GENERIC_ISA_DMA def_bool y diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 3e002ca5a287..e2655dc9b9cd 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -97,34 +97,6 @@ extern void pci_iommu_alloc(void); #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) -#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ - dma_addr_t ADDR_NAME; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ - __u32 LEN_NAME; -#define pci_unmap_addr(PTR, ADDR_NAME) \ - ((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - (((PTR)->ADDR_NAME) = (VAL)) -#define pci_unmap_len(PTR, LEN_NAME) \ - ((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - (((PTR)->LEN_NAME) = (VAL)) - -#else - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0]; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0]; -#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - do { break; } while (pci_unmap_addr(PTR, ADDR_NAME)) -#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - do { break; } while (pci_unmap_len(PTR, LEN_NAME)) - -#endif - #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 @@ -136,6 +108,8 @@ void dma32_reserve_bootmem(void); /* implement the pci_ DMA API in terms of the generic device dma_ one */ #include +#include + /* generic pci stuff */ #include #define PCIBIOS_MAX_MEM_32 0xffffffff -- cgit v1.2.2 From f41b177157718abe9a93868bb76e47d4a6f3681d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 10 Mar 2010 15:23:30 -0800 Subject: pci-dma: add linux/pci-dma.h to linux/pci.h All the architectures properly set NEED_DMA_MAP_STATE now so we can safely add linux/pci-dma.h to 
linux/pci.h and remove the linux/pci-dma.h inclusion in arch's asm/pci.h Signed-off-by: FUJITA Tomonori Acked-by: Arnd Bergmann Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index e2655dc9b9cd..404a880ea325 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -108,8 +108,6 @@ void dma32_reserve_bootmem(void); /* implement the pci_ DMA API in terms of the generic device dma_ one */ #include -#include - /* generic pci stuff */ #include #define PCIBIOS_MAX_MEM_32 0xffffffff -- cgit v1.2.2 From 0e152cd7c16832bd5cadee0c2e41d9959bc9b6f9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 12 Mar 2010 15:43:03 +0100 Subject: x86, k8 nb: Fix boot crash: enable k8_northbridges unconditionally on AMD systems de957628ce7c84764ff41331111036b3ae5bad0f changed setting of the x86_init.iommu.iommu_init function ptr only when GART IOMMU is found. One side effect of it is that num_k8_northbridges is not initialized anymore if not explicitly called. This resulted in uninitialized pointers in , for example, which uses the num_k8_northbridges thing through node_to_k8_nb_misc(). Fix that through an initcall that runs right after the PCI subsystem and does all the scanning. Then, remove initialization in gart_iommu_init() which is a rootfs_initcall and we're running before that. What is more, since num_k8_northbridges is being used in other places beside GART IOMMU, include it whenever we add AMD CPU support. The previous dependency chain in kconfig contained K8_NB depends on AGP_AMD64|GART_IOMMU which was clearly incorrect. The more natural way in terms of hardware dependency should be AGP_AMD64|GART_IOMMU depends on K8_NB depends on CPU_SUP_AMD && PCI. Make it so Number One! Signed-off-by: Borislav Petkov Cc: FUJITA Tomonori Cc: Joerg Roedel LKML-Reference: <20100312144303.GA29262@aftab> Signed-off-by: Ingo Molnar Tested-by: Joerg Roedel --- arch/x86/Kconfig | 4 ++-- arch/x86/kernel/k8.c | 14 ++++++++++++++ arch/x86/kernel/pci-gart_64.c | 2 +- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb4092568f9e..ddb52b8d38a7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -627,7 +627,7 @@ config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI + depends on X86_64 && PCI && K8_NB ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. 
This is usually needed for USB, @@ -2026,7 +2026,7 @@ endif # X86_32 config K8_NB def_bool y - depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) + depends on CPU_SUP_AMD && PCI source "drivers/pcmcia/Kconfig" diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index cbc4332a77b2..9b895464dd03 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -121,3 +121,17 @@ void k8_flush_garts(void) } EXPORT_SYMBOL_GPL(k8_flush_garts); +static __init int init_k8_nbs(void) +{ + int err = 0; + + err = cache_k8_northbridges(); + + if (err < 0) + printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 34de53b46f87..f3af115a573a 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -735,7 +735,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) + if (num_k8_northbridges == 0) return 0; #ifndef CONFIG_AGP_AMD64 -- cgit v1.2.2 From 2aa2b50dd62b5d0675bd7453fbeb5732dc2d7866 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Mar 2010 08:57:03 +0100 Subject: x86/mce: Fix build bug with CONFIG_PROVE_LOCKING=y && CONFIG_X86_MCE_INTEL=y Commit f56e8a076 "x86/mce: Fix RCU lockdep splats" introduced the following build bug: arch/x86/kernel/cpu/mcheck/mce.c: In function 'mce_log': arch/x86/kernel/cpu/mcheck/mce.c:166: error: 'mce_read_mutex' undeclared (first use in this function) arch/x86/kernel/cpu/mcheck/mce.c:166: error: (Each undeclared identifier is reported only once arch/x86/kernel/cpu/mcheck/mce.c:166: error: for each function it appears in.) Move the in-the-middle-of-file lock variable up to the variable definition section, the top of the .c file. Cc: Paul E. McKenney Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267830207-9474-3-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index bd58de4d7a29..3ab9c886b613 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,8 @@ #include "mce-internal.h" +static DEFINE_MUTEX(mce_read_mutex); + #define rcu_dereference_check_mce(p) \ rcu_dereference_check((p), \ rcu_read_lock_sched_held() || \ @@ -1490,8 +1492,6 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } -static DEFINE_MUTEX(mce_read_mutex); - static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { -- cgit v1.2.2 From 8144c880397d502d12af4ef721f3eac50163fa39 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 18 Feb 2010 23:42:47 -0500 Subject: ACPI: remove "acpi=ht" DMI blacklist SuSE added these entries when deploying ACPI in Linux-2.4. I pulled them into Linux-2.6 on 2003-08-09. Over the last 6+ years, several entries have proven to be unnecessary and deleted, while no new entries have been added. Matthew suggests that they now have negative value, and I agree. 
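Each of these entries follows the standard DMI quirk pattern visible in the removal diff below: a match table walked by dmi_check_system() early in boot, with a callback fired on the first matching board. In sketch form (the vendor and product strings here are hypothetical):

static int __init force_acpi_ht(const struct dmi_system_id *d)
{
	printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", d->ident);
	/* ... disable full ACPI, set acpi_ht = 1 ... */
	return 0;
}

static struct dmi_system_id __initdata acpi_dmi_table[] = {
	{
		.callback = force_acpi_ht,
		.ident	  = "Example Workstation",	/* hypothetical entry */
		.matches  = {
			DMI_MATCH(DMI_SYS_VENDOR, "Example Vendor"),
			DMI_MATCH(DMI_PRODUCT_NAME, "Example Product"),
		},
	},
	{ }	/* terminating empty entry */
};

Removing the nine acpi=ht entries leaves the table in place for the remaining PCI IRQ routing quirks.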
Based-on-patch-by: Matthew Garrett Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 93 --------------------------------------------- 1 file changed, 93 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a54d714545ff..df586bbc9447 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1292,23 +1292,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) return 0; } -/* - * Limit ACPI to CPU enumeration for HT - */ -static int __init force_acpi_ht(const struct dmi_system_id *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", - d->ident); - disable_acpi(); - acpi_ht = 1; - } else { - printk(KERN_NOTICE - "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); - } - return 0; -} - /* * Force ignoring BIOS IRQ0 pin2 override */ @@ -1344,82 +1327,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, }, - /* - * Boxes that need acpi=ht - */ - { - .callback = force_acpi_ht, - .ident = "FSC Primergy T850", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), - DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "HP VISUALIZE NT Workstation", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "Compaq Workstation W8000", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), - DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ASUS CUR-DLS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ABIT i440BX-W83977", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ABIT "), - DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM Bladecenter", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eServer xSeries 360", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eserver xSeries 330", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eserver xSeries 440", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), - }, - }, - /* * Boxes that need ACPI PCI IRQ routing disabled */ -- cgit v1.2.2 From 4c81ba4900ab4eb24c7d2ba1aca594c644b6ce4c Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 14 Mar 2010 16:28:46 -0400 Subject: ACPI: plan to delete "acpi=ht" boot option Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index df586bbc9447..7914ab0ad76e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1559,8 +1559,10 @@ static int __init parse_acpi(char *arg) } /* Limit ACPI just to boot-time to enable HT */ else if (strcmp(arg, "ht") == 0) { - if (!acpi_force) + if (!acpi_force) { + printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n"); disable_acpi(); + } 
acpi_ht = 1; } /* acpi=rsdt use RSDT instead of XSDT */ -- cgit v1.2.2 From d8191fa4a33fdc817277da4f2b7f771ff605a41c Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Mon, 22 Feb 2010 12:11:39 -0700 Subject: ACPI: processor: driver doesn't need to evaluate _PDC Now that the early _PDC evaluation path knows how to correctly evaluate _PDC on only physically present processors, there's no need for the processor driver to evaluate it later when it loads. To cover the hotplug case, push _PDC evaluation down into the hotplug paths. Cc: x86@kernel.org Cc: Tony Luck Acked-by: Venkatesh Pallipadi Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a54d714545ff..d635a93ae59c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -490,6 +490,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) * ACPI based hotplug support for CPU */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +#include static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { @@ -567,6 +568,8 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) goto free_new_map; } + acpi_processor_set_pdc(handle); + cpu = cpumask_first(new_map); acpi_map_cpu2node(handle, cpu, physid); -- cgit v1.2.2 From 36e9e1eab777e077f7484d309ff676d0568e27d1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 15 Mar 2010 14:33:06 -0800 Subject: x86: Handle legacy PIC interrupts on all the cpu's Ingo Molnar reported that with the recent changes of not statically blocking IRQ0_VECTOR..IRQ15_VECTOR's on all the cpu's, broke an AMD platform (with Nvidia chipset) boot when "noapic" boot option is used. On this platform, legacy PIC interrupts are getting delivered to all the cpu's instead of just the boot cpu. Thus not initializing the vector to irq mapping for the legacy irq's resulted in not handling certain interrupts causing boot hang. Fix this by initializing the vector to irq mapping on all the logical cpu's, if the legacy IRQ is handled by the legacy PIC. Reported-by: Ingo Molnar Signed-off-by: Suresh Siddha [ -v2: io-apic-enabled improvement ] Acked-by: Yinghai Lu Cc: Eric W. Biederman LKML-Reference: <1268692386.3296.43.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 1 + arch/x86/kernel/apic/io_apic.c | 8 ++++++++ arch/x86/kernel/irqinit.c | 22 ++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 2 +- 4 files changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a929c9ede33d..46c0fe05f230 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -133,6 +133,7 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); +extern void setup_vector_irq(int cpu); #ifdef CONFIG_X86_IO_APIC extern void lock_vector_lock(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e4e0ddcb1546..463de9a858ad 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1268,6 +1268,14 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; + + /* + * If it is a legacy IRQ handled by the legacy PIC, this cpu + * will be part of the irq_cfg's domain. 
+ */ + if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) + cpumask_set_cpu(cpu, cfg->domain); + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index ef257fc2921b..f01d390f9c5b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -141,6 +141,28 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } +/* + * Setup the vector to irq mappings. + */ +void setup_vector_irq(int cpu) +{ +#ifndef CONFIG_X86_IO_APIC + int irq; + + /* + * On most of the platforms, legacy PIC delivers the interrupts on the + * boot cpu. But there are certain platforms where PIC interrupts are + * delivered to multiple cpu's. If the legacy IRQ is handled by the + * legacy PIC, for the new cpu that is coming online, setup the static + * legacy vector to irq mapping: + */ + for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) + per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; +#endif + + __setup_vector_irq(cpu); +} + static void __init smp_intr_init(void) { #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a02e80c3c54b..06d98ae5a802 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -247,7 +247,7 @@ static void __cpuinit smp_callin(void) /* * Need to setup vector mappings before we enable interrupts. */ - __setup_vector_irq(smp_processor_id()); + setup_vector_irq(smp_processor_id()); /* * Get our bogomips. * -- cgit v1.2.2 From ff30a0543e9a6cd732582063e7cae951cdb7acf2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Mar 2010 10:11:15 +0000 Subject: x86: Fix placement of FIX_OHCI1394_BASE Ever for 32-bit with sufficiently high NR_CPUS, and starting with commit 789d03f584484af85dbdc64935270c8e45f36ef7 also for 64-bit, the statically allocated early fixmap page tables were not covering FIX_OHCI1394_BASE, leading to a boot time crash when "ohci1394_dma=early" was used. Despite this entry not being a permanently used one, it needs to be moved into the permanent range since it has to be close to FIX_DBGP_BASE and FIX_EARLYCON_MEM_BASE. Reported-bisected-and-tested-by: Justin P. 
Mattock Fixes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=14487 Signed-off-by: Jan Beulich Cc: # [as far back as long as it still applies] LKML-Reference: <4B9E15D30200007800034D23@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 635f03bb4995..d07b44f7d1dc 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -82,6 +82,9 @@ enum fixed_addresses { #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -132,9 +135,6 @@ enum fixed_addresses { (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) : __end_of_permanent_fixed_addresses, FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif -- cgit v1.2.2 From dcd5c1662db59a6b82942f47fb6ac9dd63f6d3dd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Mar 2010 01:05:02 +0100 Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs perf_arch_fetch_caller_regs() is exported for the overriden x86 version, but not for the generic weak version. As a general rule, weak functions should not have their symbol exported in the same file they are defined. So let's export it on trace_event_perf.c as it is used by trace events only. This fixes: ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined! ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined! -v2: And also only build it if trace events are enabled. -v3: Fix changelog mistake Reported-by: Stephen Rothwell Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Paul Mackerras LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 7645faea8e85..60398a0d947c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1702,6 +1702,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } +#ifdef CONFIG_EVENT_TRACING void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { regs->ip = ip; @@ -1713,4 +1714,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); +#endif -- cgit v1.2.2 From 035a02c1e1de31888e8b6adac0ff667971ac04db Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Mar 2010 12:09:22 +0100 Subject: x86, amd: Restrict usage of c1e_idle() Currently c1e_idle returns true for all CPUs greater than or equal to family 0xf model 0x40. This covers too many CPUs. Meanwhile a respective erratum for the underlying problem was filed (#400). This patch adds the logic to check whether erratum #400 applies to a given CPU. Especially for CPUs where SMI/HW triggered C1e is not supported, c1e_idle() doesn't need to be used. We can check this by looking at the respective OSVW bit for erratum #400. 
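In sketch form, the OSVW test that the diff below adds to check_c1e_idle() looks like this (pulled out into a hypothetical helper for readability; MSR accessors and constants as in the patch):

static bool amd_erratum_400_applies(void)
{
	u64 val;

	/* OSVW_ID_LENGTH tells us how many OSVW status bits are valid */
	rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
	if (val >= 2) {
		/* bit 1 of OSVW_STATUS reports erratum #400 */
		rdmsrl(MSR_AMD64_OSVW_STATUS, val);
		return val & BIT(1);
	}

	/* OSVW does not cover the erratum: conservatively assume affected */
	return true;
}

Only when the erratum applies does the kernel need to route idle through c1e_idle().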
Cc: # .32.x .33.x Signed-off-by: Andreas Herrmann LKML-Reference: <20100319110922.GA19614@alberich.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/process.c | 32 ++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1cd58cdbc03f..4604e6a54d36 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -105,6 +105,8 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ad9540676fcc..28ad9f4d8b94 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -526,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) } /* - * Check for AMD CPUs, which have potentially C1E support + * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. + * For more information see + * - Erratum #400 for NPT family 0xf and family 0x10 CPUs + * - Erratum #365 for family 0x11 (not affected because C1e not in use) */ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) { + u64 val; if (c->x86_vendor != X86_VENDOR_AMD) - return 0; - - if (c->x86 < 0x0F) - return 0; + goto no_c1e_idle; /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; + if (c->x86 == 0x0F && c->x86_model >= 0x40) + return 1; - return 1; + if (c->x86 == 0x10) { + /* + * check OSVW bit for CPUs that are not affected + * by erratum #400 + */ + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); + if (val >= 2) { + rdmsrl(MSR_AMD64_OSVW_STATUS, val); + if (!(val & BIT(1))) + goto no_c1e_idle; + } + return 1; + } + +no_c1e_idle: + return 0; } static cpumask_var_t c1e_mask; -- cgit v1.2.2 From a90110c61073eab95d1986322693c2b9a8a6a5f6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 21 Mar 2010 21:51:51 +0100 Subject: x86 / perf: Fix suspend to RAM on HP nx6325 Commit 3f6da3905398826d85731247e7fbcf53400c18bd (perf: Rework and fix the arch CPU-hotplug hooks) broke suspend to RAM on my HP nx6325 (and most likely on other AMD-based boxes too) by allowing amd_pmu_cpu_offline() to be executed for CPUs that are going offline as part of the suspend process. The problem is that cpuhw->amd_nb may be NULL already, so the function should make sure it's not NULL before accessing the object pointed to by it. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_amd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 573458f1caf2..b87e0b6970cb 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -348,10 +348,12 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_lock(&amd_nb_lock); - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); + if (cpuhw->amd_nb) { + if (--cpuhw->amd_nb->refcnt == 0) + kfree(cpuhw->amd_nb); - cpuhw->amd_nb = NULL; + cpuhw->amd_nb = NULL; + } raw_spin_unlock(&amd_nb_lock); } -- cgit v1.2.2 From c9c9b564717e5b6b2ae8b770da1c73a348c84cce Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 22 Mar 2010 16:34:10 -0600 Subject: x86/PCI: remove redundant warnings pci_claim_resource() already prints more detailed error messages, so these are really redundant. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index dece3eb9c906..46fd43f79103 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -127,9 +127,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, - "can't reserve window %pR\n", - r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -181,8 +178,6 @@ static void __init pcibios_allocate_resources(int pass) "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, - "can't reserve %pR\n", r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; -- cgit v1.2.2 From eb9fc8ef7cb1362374e55d9503e3e7458f319991 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 25 Mar 2010 09:28:24 -0600 Subject: x86/PCI: for host bridge address space collisions, show conflicting resource With insert_resource_conflict(), we can learn what the actual conflict is, so print that info for debugging purposes. 
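[ Note: a minimal sketch of the resulting usage pattern, with a hypothetical device pointer; it assumes only that insert_resource_conflict() behaves like insert_resource() but returns the conflicting resource (or NULL on success):

	conflict = insert_resource_conflict(root, res);
	if (conflict)
		dev_err(&dev->dev, "window %pR conflicts with %s %pR\n",
			res, conflict->name, conflict);

The change to setup_resource() below does exactly this for host bridge windows. ]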
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 6e22454bfaa6..75ac3f856ea5 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -122,7 +122,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - struct resource *root; + struct resource *root, *conflict; u64 start, end; status = resource_to_addr(acpi_res, &addr); @@ -157,9 +157,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } - if (insert_resource(root, res)) { + conflict = insert_resource_conflict(root, res); + if (conflict) { dev_err(&info->bridge->dev, - "can't allocate host bridge window %pR\n", res); + "address space collision: host bridge window %pR " + "conflicts with %s %pR\n", + res, conflict->name, conflict); } else { pci_bus_add_resource(info->bus, res, 0); info->res_num++; -- cgit v1.2.2 From d558b483d5a73f5718705b270cb2090f66ea48c8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 25 Mar 2010 09:28:30 -0600 Subject: x86/PCI: truncate _CRS windows with _LEN > _MAX - _MIN + 1 Yanko's GA-MA78GM-S2H (BIOS F11) reports the following resource in a PCI host bridge _CRS: [07] 32-Bit DWORD Address Space Resource Min Relocatability : MinFixed Max Relocatability : MaxFixed Address Minimum : CFF00000 (_MIN) Address Maximum : FEBFFFFF (_MAX) Address Length : 3EE10000 (_LEN) This is invalid per spec (ACPI 4.0, 6.4.3.5) because it's a fixed size, fixed location descriptor, but _LEN != _MAX - _MIN + 1. Based on https://bugzilla.kernel.org/show_bug.cgi?id=15480#c15, I think Windows handles this by truncating the window so it fits between _MIN and _MAX. I also verified this by modifying the SeaBIOS DSDT and booting Windows 2008 R2 with qemu. 
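[ Note: the truncation amounts to clamping _LEN so that the window fits between _MIN and _MAX; a sketch using the struct acpi_resource_address64 field names from the patch below:

	u64 max_len = addr.maximum - addr.minimum + 1;	/* _MAX - _MIN + 1 */

	if (addr.address_length > max_len)
		addr.address_length = max_len;	/* trim _LEN to fit */
]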
This patch makes Linux truncate the window, too, which fixes: http://bugzilla.kernel.org/show_bug.cgi?id=15480 Signed-off-by: Bjorn Helgaas Tested-by: Yanko Kaneti Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 75ac3f856ea5..e31160216efb 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -123,7 +123,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) acpi_status status; unsigned long flags; struct resource *root, *conflict; - u64 start, end; + u64 start, end, max_len; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -140,6 +140,17 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; + max_len = addr.maximum - addr.minimum + 1; + if (addr.address_length > max_len) { + dev_printk(KERN_DEBUG, &info->bridge->dev, + "host bridge window length %#llx doesn't fit in " + "%#llx-%#llx, trimming\n", + (unsigned long long) addr.address_length, + (unsigned long long) addr.minimum, + (unsigned long long) addr.maximum); + addr.address_length = max_len; + } + start = addr.minimum + addr.translation_offset; end = start + addr.address_length - 1; -- cgit v1.2.2 From 596b711ed6b5235f8545680ef38ace00f9898c32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 28 Mar 2010 19:42:54 -0700 Subject: x86: Make smp_locks end with page alignment Fix: ------------[ cut here ]------------ WARNING: at arch/x86/mm/init.c:342 free_init_pages+0x4c/0xfa() free_init_pages: range [0x40daf000, 0x40db5c24] is not aligned Modules linked in: Pid: 0, comm: swapper Not tainted 2.6.34-rc2-tip-03946-g4f16b23-dirty #50 Call Trace: [<40232e9f>] warn_slowpath_common+0x65/0x7c [<4021c9f0>] ? free_init_pages+0x4c/0xfa [<40881434>] ? _etext+0x0/0x24 [<40232eea>] warn_slowpath_fmt+0x24/0x27 [<4021c9f0>] free_init_pages+0x4c/0xfa [<40881434>] ? _etext+0x0/0x24 [<40d3f4bd>] alternative_instructions+0xf6/0x100 [<40d3fe4f>] check_bugs+0xbd/0xbf [<40d398a7>] start_kernel+0x2d5/0x2e4 [<40d390ce>] i386_start_kernel+0xce/0xd5 ---[ end trace 4eaa2a86a8e2da22 ]--- Comments in vmlinux.lds.S already said: | /* | * smp_locks might be freed after init | * start/end must be page aligned | */ Signed-off-by: Yinghai Lu Acked-by: Johannes Weiner Cc: David Miller Cc: Benjamin Herrenschmidt Cc: Linus Torvalds LKML-Reference: <1269830604-26214-2-git-send-email-yinghai@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 44879df55696..2cc249718c46 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,8 +291,8 @@ SECTIONS .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) - __smp_locks_end = .; . = ALIGN(PAGE_SIZE); + __smp_locks_end = .; } #ifdef CONFIG_X86_64 -- cgit v1.2.2 From c967da6a0ba837f762042e931d4afcf72045547c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 28 Mar 2010 19:42:55 -0700 Subject: x86: Make sure free_init_pages() frees pages on page boundary When CONFIG_NO_BOOTMEM=y, memory can be used more efficiently, i.e. in a more compact fashion. Example: Allocated new RAMDISK: 00ec2000 - 0248ce57 Move RAMDISK from 000000002ea04000 - 000000002ffcee56 to 00ec2000 - 0248ce56 The new RAMDISK's end is not page aligned. The last page could be shared with other users.
When free_init_pages() is called for the initrd or .init, the page could be freed and we could corrupt other data. Code segment in free_init_pages(): | for (; addr < end; addr += PAGE_SIZE) { | ClearPageReserved(virt_to_page(addr)); | init_page_count(virt_to_page(addr)); | memset((void *)(addr & ~(PAGE_SIZE-1)), | POISON_FREE_INITMEM, PAGE_SIZE); | free_page(addr); | totalram_pages++; | } The last half page could then be used as one whole free page. So page align the boundaries. -v2: align the original initrd, according to Johannes; otherwise we have a chance to lose one page. We still need to keep initrd_end unaligned, otherwise it could confuse the decompressor. -v3: change to WARN_ON instead, suggested by Johannes. -v4: use PAGE_ALIGN, suggested by Johannes. We may later rename that macro to PAGE_ALIGN_UP, with a PAGE_ALIGN_DOWN counterpart. Add comments noting that the ramdisk start is assumed to be page aligned in relocate_initrd(); re-read ramdisk_image instead of saving it, to keep the diff smaller. Add a warning for a wrong range, suggested by Johannes. -v6: remove one WARN(). We need to align the beginning in free_init_pages(); do not copy more than ramdisk_size, noticed by Johannes. Reported-by: Stanislaw Gruszka Tested-by: Stanislaw Gruszka Signed-off-by: Yinghai Lu Acked-by: Johannes Weiner Cc: David Miller Cc: Benjamin Herrenschmidt Cc: Linus Torvalds LKML-Reference: <1269830604-26214-3-git-send-email-yinghai@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head32.c | 4 +++- arch/x86/kernel/head64.c | 3 ++- arch/x86/kernel/setup.c | 10 ++++++---- arch/x86/mm/init.c | 32 ++++++++++++++++++++++++++------ 4 files changed, 37 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index adedeef1dedc..b2e246037392 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -44,9 +45,10 @@ void __init i386_start_kernel(void) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b5a9896ca1e7..7147143fd614 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d7ba1a449bd..d76e18570c60 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -314,16 +314,17 @@ static void __init reserve_brk(void) #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { - + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 area_size = PAGE_ALIGN(ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, + ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, PAGE_SIZE); if (ramdisk_here == -1ULL) @@ -332,7 +333,7 @@ static void __init relocate_initrd(void) /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, + reserve_early(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -376,9 +377,10 @@ static void __init relocate_initrd(void) static void __init reserve_initrd(void) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e71c5cbc8f35..452ee5b8f309 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -331,11 +331,23 @@ int devmem_is_allowed(unsigned long pagenr) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr = begin; + unsigned long addr; + unsigned long begin_aligned, end_aligned; - if (addr >= end) + /* Make sure boundaries are page aligned */ + begin_aligned = PAGE_ALIGN(begin); + end_aligned = end & PAGE_MASK; + + if (WARN_ON(begin_aligned != begin || end_aligned != end)) { + begin = begin_aligned; + end = end_aligned; + } + + if (begin >= end) return; + addr = begin; + /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will @@ -343,7 +355,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) */ #ifdef CONFIG_DEBUG_PAGEALLOC printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); + begin, end); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else /* @@ -358,8 +370,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } @@ -376,6 +387,15 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + /* + * end could be not aligned, and We can not align that, + * decompresser could be confused by aligned initrd_end + * We already reserve the end partial page before in + * - i386_start_kernel() + * - x86_64_start_kernel() + * - relocate_initrd() + * So here We can do PAGE_ALIGN() safely to get partial page to be freed + */ + free_init_pages("initrd memory", start, PAGE_ALIGN(end)); } #endif -- cgit v1.2.2 From 57f4c226d1e095a2db20c691c3cf089188fe1c5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Mar 2010 15:32:53 +0900 Subject: x86: don't include slab.h from arch/x86/include/asm/pgtable_32.h 
Including slab.h from x86 pgtable_32.h creates a troublesome dependency chain w/ ftrace enabled. The following chain leads to inclusion of pgtable_32.h from define_trace.h: trace/define_trace.h trace/ftrace.h linux/ftrace_event.h linux/ring_buffer.h linux/mm.h asm/pgtable.h asm/pgtable_32.h slab.h itself defines trace hooks via linux/sl[aou]b_def.h linux/kmemtrace.h trace/events/kmem.h If slab.h is not included before define_trace.h is included, this leads to duplicate definitions of kmemtrace hooks or other include dependency problems. pgtable_32.h doesn't need slab.h to begin with. Don't include it from there. Signed-off-by: Tejun Heo Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin --- arch/x86/include/asm/pgtable_32.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 47339a1ac7b6..2984a25ff383 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -19,7 +19,6 @@ #include #include -#include #include #include -- cgit v1.2.2 From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Mar 2010 17:04:11 +0900 Subject: include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h, which in turn includes gfp.h, making everything defined by the two files universally available and complicating inclusion dependencies. The percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities to include those headers directly instead of assuming availability. As this conversion needs to touch a large number of source files, the following script is used as the basis of the conversion: http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the following: * Scan files for gfp and slab usages and update includes such that only the necessary includes are there, i.e. if only gfp is used, gfp.h; if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and tries to put the new include such that its order conforms to its surroundings. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have a fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition, while adding it to an implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g.
lib/decompress_*.c used malloc/free() wrappers around slab APIs, requiring slab.h to be added manually. 5. The script was run on all .h files, but without automatically editing them, as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored, as stuff from gfp.h was usually widely available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build tests were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64, which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as a bisection point. Given the fact that I had only a couple of failures from the tests in step 7, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers, which should be easily discoverable on most builds of the specific arch. Signed-off-by: Tejun Heo Guess-its-ok-by: Christoph Lameter Cc: Ingo Molnar Cc: Lee Schermerhorn --- arch/x86/crypto/fpu.c | 1 + arch/x86/ia32/ia32_aout.c | 1 - arch/x86/ia32/sys_ia32.c | 1 + arch/x86/kernel/acpi/boot.c | 1 + arch/x86/kernel/alternative.c | 1 + arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- arch/x86/kernel/apb_timer.c | 1 + arch/x86/kernel/apic/es7000_32.c | 1 + arch/x86/kernel/apic/io_apic.c | 1 + arch/x86/kernel/apic/nmi.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + arch/x86/kernel/bootflag.c | 1 - arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 1 + arch/x86/kernel/cpu/cpufreq/elanfreq.c | 1 - arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 1 + arch/x86/kernel/cpu/cpufreq/longrun.c | 1 - arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 - arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 1 + arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 1 - arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 1 + arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 1 - arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 1 - arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 1 - arch/x86/kernel/cpu/mcheck/mce-inject.c | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 1 + arch/x86/kernel/cpu/mcheck/mce_amd.c | 1 + arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + arch/x86/kernel/cpu/mtrr/generic.c | 1 - arch/x86/kernel/cpu/mtrr/if.c | 1 + arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpuid.c | 1 + arch/x86/kernel/crash_dump_32.c | 1 + arch/x86/kernel/hpet.c | 1 + arch/x86/kernel/i387.c | 1 + arch/x86/kernel/i8259.c | 1 - arch/x86/kernel/irqinit.c | 1 - arch/x86/kernel/k8.c | 2 +- arch/x86/kernel/kdebugfs.c | 1 + arch/x86/kernel/ldt.c | 1 + arch/x86/kernel/machine_kexec_64.c | 1 + arch/x86/kernel/mca_32.c | 1 + arch/x86/kernel/module.c | 1 + arch/x86/kernel/msr.c | 1 + arch/x86/kernel/pci-dma.c | 1 + arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/pci-nommu.c | 1 + arch/x86/kernel/ptrace.c | 1 + arch/x86/kernel/setup.c | 1 - arch/x86/kernel/smp.c | 1 + arch/x86/kernel/smpboot.c | 1 +
arch/x86/kernel/tlb_uv.c | 1 + arch/x86/kernel/uv_irq.c | 1 + arch/x86/kernel/uv_time.c | 1 + arch/x86/kernel/vmi_32.c | 1 + arch/x86/kvm/i8254.c | 1 + arch/x86/kvm/i8259.c | 1 + arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/svm.c | 1 + arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + arch/x86/mm/hugetlbpage.c | 1 - arch/x86/mm/init.c | 1 + arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 1 + arch/x86/mm/kmmio.c | 1 + arch/x86/mm/mmio-mod.c | 1 + arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pat.c | 2 +- arch/x86/mm/pgtable.c | 1 + arch/x86/mm/pgtable_32.c | 1 - arch/x86/pci/acpi.c | 1 + arch/x86/pci/common.c | 1 + arch/x86/pci/irq.c | 1 - arch/x86/pci/mmconfig-shared.c | 1 + arch/x86/pci/pcbios.c | 1 + arch/x86/power/hibernate_32.c | 1 + arch/x86/power/hibernate_64.c | 1 + arch/x86/vdso/vma.c | 1 + arch/x86/xen/debugfs.c | 1 + arch/x86/xen/enlighten.c | 1 + arch/x86/xen/mmu.c | 1 + arch/x86/xen/smp.c | 1 + arch/x86/xen/spinlock.c | 1 + arch/x86/xen/time.c | 1 + 86 files changed, 70 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index daef6cd2b45d..1a8f8649c035 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -16,6 +16,7 @@ #include #include #include +#include #include struct crypto_fpu_ctx { diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 280c019cfad8..0350311906ae 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 74c35431b7d8..626be156d88d 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0061ea263061..cd40aba6aa95 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3a4bf35c179b..1a160d5d44d0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index adb0ba025702..f3dadb571d9b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -18,8 +18,8 @@ */ #include -#include #include +#include #include #include #include diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9dc91b431470..42f5350b908f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 4b7099526d2c..ff469e470059 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index dd2b5f264643..03ba1b895f5e 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 463de9a858ad..127b8718abfb 
100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -36,6 +36,7 @@ #include #include #include /* time_after() */ +#include #ifdef CONFIG_ACPI #include #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 8aa65adbd25d..1edaf15c0b8e 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 49dbeaef2a27..c085d52dbaf2 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 30f25a75fe28..5de7f4c56971 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1b1920fa7c80..459168083b77 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 006b278b0d5d..c587db472a75 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index ac27ec2264d5..16e3483be9e3 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -80,6 +80,7 @@ #include #include #include +#include #include diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index da5f70fcb766..e7b559d74c52 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 869615193720..7b8a8ba67b07 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ff36d2979a90..ce7cde713e71 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index cb01dac267d3..b3379d6a5c57 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 8d672ef162ce..9b1ff37de46a 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -20,6 +20,7 @@ #include /* current */ #include #include +#include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c 
index 2ce8e0b5cc54..561758e95180 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include "speedstep-lib.h" diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index ad0083abfa23..a94ec6be69fa 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index 04d73c114e49..8abd869baabf 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 73734baa50f2..e7dbde7bfedb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3ab9c886b613..8a6f0afa767e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index cda932ca3ade..224392d8fe8c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index d15df6e49bf0..62b48e40920a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -5,6 +5,7 @@ * Author: Andi Kleen */ +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9aa5dc76ff4a..fd31a441c61c 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -6,7 +6,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index e006e56f699c..79289632cb27 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #define LINE_SIZE 80 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 60398a0d947c..0316ffe851bd 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 83e5e628de73..8b862d5900fe 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index cd97ce18c29d..67414550c3cc 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -5,6 +5,7 @@ * Copyright (C) IBM Corporation, 2004. 
All rights reserved */ +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ee4fa1bfcb33..d10a7e7294f4 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index c01a2b846d47..54c31c285488 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index fb725ee15f55..7c9f02c130f3 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f01d390f9c5b..0ed2d300cd46 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index 9b895464dd03..0f7bc20cfcde 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -2,8 +2,8 @@ * Shared support code for AMD K8 northbridges and derivates. * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. */ -#include #include +#include #include #include #include diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index e444357375ce..8afd9f321f10 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ec6ef60cbd17..ea697263b373 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 4a8bb82248ae..035c8c529181 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 845d80ce1ef1..63eaf6596233 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 89f386f044e4..e0bc186d7501 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 206735ac8cbd..4d4468e9f47c 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a4ac764a6880..4b7e3d8b01dd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index f3af115a573a..68cd24f9deae 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 22be12b60a8f..3af4af810c07 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -4,6 +4,7 @@ 
#include #include #include +#include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a503b1fd04e5..2e9b55027b7e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d7ba1a449bd..c08d1e3261a8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -55,7 +55,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index ec1de97600e7..d801210945d6 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 06d98ae5a802..be40f82b09af 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 364d015efebc..17b03dd3a6b5 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index ece73d8e3240..1d40336b030a 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 2b75ef638dbc..56e421bc379b 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -19,6 +19,7 @@ * Copyright (c) Dimitri Sivanich */ #include +#include #include #include diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 7dd599deca4a..ce9fbacb7526 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 294698b6daff..0150affad25d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -32,6 +32,7 @@ #define pr_fmt(fmt) "pit: " fmt #include +#include #include "irq.h" #include "i8254.h" diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 07771da85de5..a790fa128a9f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,6 +26,7 @@ * Port from Qemu. 
*/ #include +#include #include #include "irq.h" diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b224f90087b..1eb7a4ae0c9c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 741373e8ca77..48aeee8eefb0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 52f78dd03010..445c59411ed0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -26,6 +26,7 @@ #include #include #include +#include #include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 14873b9f8430..686492ed3079 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e46282a56565..24cd0ee896e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f46c340727b8..069ce7c37c01 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e71c5cbc8f35..a4a7d7dc8aa4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,3 +1,4 @@ +#include #include #include #include diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5cb3f0f54f47..bca79091b9d6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -25,11 +25,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e9b040e1cde5..ee41bba315d1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 536fb6823366..5d0e67fff1a6 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 34a3291ca103..3adff7dcc148 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index cf07c26d9a4a..28195c350b97 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -6,13 +6,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ae9648eb1c7f..edc8b95afc1a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c9ba9deafe83..5c4ee422590e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 46c8834aedc0..1a8faf09afed 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -6,7 +6,6 @@ 
#include #include #include -#include #include #include #include diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index e31160216efb..c7b1ebfb7da7 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 294e10cb11e1..cf2e93869c48 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 8b107521d24e..5d362b5ba06f 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 8f3f9a50b1e0..39b9ebe8f886 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 1c975cc9839e..59a225c17b84 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 81197c62d5b3..3769079874d8 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -6,6 +6,7 @@ * Copyright (c) 2006 Rafael J. Wysocki */ +#include #include #include diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 65fdc86e923f..d24f983ba1e5 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -8,6 +8,7 @@ * Copyright (c) 2001 Patrick Mochel */ +#include #include #include #include diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 21e1aeb9f3ea..ac74869b8140 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index e133ce25e290..1304bcec8ee5 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "debugfs.h" diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b607239c1ba8..65d8d79b46a8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f9eb7de74f42..914f04695ce5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index deafb65ef44e..a29693fd3138 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -14,6 +14,7 @@ */ #include #include +#include #include #include diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 24ded31b5aec..e0500646585d 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -6,6 +6,7 @@ #include #include #include +#include #include diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0d3f07cd1b5f..32764b8880b5 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include -- cgit v1.2.2 From 8ae06d223f8203c72104e5c0c4ee49a000aedb42 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 5 Mar 2010 08:59:32 +0800 Subject: x86-32, resume: do a global tlb flush in S4 resume 
Colin King reported a strange oops in the S4 resume code path (see below). The test system has an i5/i7 CPU. The kernel doesn't enable PAE, so 4M page tables are used. The oops always happens at virtual address 0xc03ff000, which is mapped to the last 4k of the first 4M of memory. Doing a global TLB flush fixes the issue. EIP: 0060:[] EFLAGS: 00010086 CPU: 0 EIP is at copy_loop+0xe/0x15 EAX: 36aeb000 EBX: 00000000 ECX: 00000400 EDX: f55ad46c ESI: 0f800000 EDI: c03ff000 EBP: f67fbec4 ESP: f67fbea8 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 ... ... CR2: 00000000c03ff000 Tested-by: Colin Ian King Signed-off-by: Shaohua Li LKML-Reference: <20100305005932.GA22675@sli10-desk.sh.intel.com> Acked-by: Rafael J. Wysocki Signed-off-by: H. Peter Anvin Cc: --- arch/x86/power/hibernate_asm_32.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index b641388d8286..ad47daeafa4e 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -27,10 +27,17 @@ ENTRY(swsusp_arch_suspend) ret ENTRY(restore_image) + movl mmu_cr4_features, %ecx movl resume_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 + jecxz 1f # cr4 Pentium and higher, skip if zero + andl $~(X86_CR4_PGE), %ecx + movl %ecx, %cr4; # turn off PGE + movl %cr3, %eax; # flush TLB + movl %eax, %cr3 +1: movl restore_pblist, %edx .p2align 4,,7 @@ -54,16 +61,8 @@ done: movl $swapper_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 - /* Flush TLB, including "global" things (vmalloc) */ movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero - movl %ecx, %edx - andl $~(X86_CR4_PGE), %edx - movl %edx, %cr4; # turn off PGE -1: - movl %cr3, %eax; # flush TLB - movl %eax, %cr3 - jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %cr4; # turn PGE back on 1: -- cgit v1.2.2 From 9f3a5f52aa63d3aa4c64a7245153549bb66bad8c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 29 Mar 2010 22:38:29 -0700 Subject: x86: Make e820_remove_range() handle the all-covered case Rusty found that on lguest, with trim_bios_range, max_pfn is not right anymore, and it looks like e820_remove_range() does not work right. [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] LGUEST: 0000000000000000 - 0000000004000000 (usable) [ 0.000000] Notice: NX (Execute Disable) protection missing in CPU or disabled in BIOS! [ 0.000000] DMI not present or invalid. [ 0.000000] last_pfn = 0x3fa0 max_arch_pfn = 0x100000 [ 0.000000] init_memory_mapping: 0000000000000000-0000000003fa0000 The root cause is that e820_remove_range() doesn't handle the fully-covered case. e820_remove_range(BIOS_START, BIOS_END - BIOS_START, ...) produces a bogus range as a result. Make it match e820_update_range() by handling that case too. Reported-by: Rusty Russell Signed-off-by: Yinghai Lu Tested-by: Rusty Russell LKML-Reference: <4BB18E55.6090903@kernel.org> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/e820.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 740b440fbd73..7bca3c6a02fb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -519,29 +519,45 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", (unsigned long long) start, (unsigned long long) end); - e820_print_type(old_type); + if (checktype) + e820_print_type(old_type); printk(KERN_CONT "\n"); for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; + u64 ei_end; if (checktype && ei->type != old_type) continue; + + ei_end = ei->addr + ei->size; /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { + if (ei->addr >= start && ei_end <= end) { real_removed_size += ei->size; memset(ei, 0, sizeof(struct e820entry)); continue; } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + e820_add_region(end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_removed_size += size; + continue; + } + /* partially covered */ final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); + final_end = min(end, ei_end); if (final_start >= final_end) continue; real_removed_size += final_end - final_start; + /* + * left range could be head or tail, so need to update + * size at first. + */ ei->size -= final_end - final_start; if (ei->addr < final_start) continue; -- cgit v1.2.2 From e49a5bd38159dfb1928fd25b173bc9de4bbadb21 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Mar 2010 19:40:03 +0100 Subject: perf: Use hot regs with software sched switch/migrate events The scheduler's task migration events don't work because they always pass NULL regs to perf_sw_event(). The event hence gets filtered in perf_swevent_add(). The scheduler's context switch events use task_pt_regs() to get the context when the event occurred, which is the wrong thing to do, as it won't give us the place in the kernel where we went to sleep but rather the place where we left userspace. The result is even more wrong if we switch from a kernel thread. Use the hot regs snapshot for both events, as they belong to the non-interrupt/exception based events family. Unlike page faults and the like, which provide the regs matching the exact origin of the event, we need to save the current context. This makes the task migration event work and fixes the context switch callchains and origin IP. Example: perf record -a -e cs Before: 10.91% ksoftirqd/0 0 [k] 0000000000000000 | --- (nil) perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_swevent_ctx_event do_perf_sw_event __perf_sw_event perf_event_task_sched_out schedule run_ksoftirqd kthread kernel_thread_helper After: 23.77% hald-addon-stor [kernel.kallsyms] [k] schedule | --- schedule | |--60.00%-- schedule_timeout | wait_for_common | wait_for_completion | blk_execute_rq | scsi_execute | scsi_execute_req | sr_test_unit_ready | | | |--66.67%-- sr_media_change | | media_changed | | cdrom_media_changed | | sr_block_media_changed | | check_disk_change | | cdrom_open v2: Always build perf_arch_fetch_caller_regs() now that software events need that too.
They don't need it from modules, unlike trace events, so we keep the EXPORT_SYMBOL in trace_event_perf.c Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar Cc: David Miller --- arch/x86/kernel/cpu/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 60398a0d947c..5fb490c6ee5c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1702,7 +1702,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -#ifdef CONFIG_EVENT_TRACING void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { regs->ip = ip; @@ -1714,4 +1713,3 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } -#endif -- cgit v1.2.2 From ab310b5edb8b601bcb02491ed6f7676da4fd1757 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 30 Mar 2010 14:05:07 -0500 Subject: x86,kgdb: Always initialize the hw breakpoint attribute It is required to call hw_breakpoint_init() on an attr before using it in any other calls. This fixes the problem where kgdb will sometimes fail to initialize on x86_64. Signed-off-by: Jason Wessel Cc: Ingo Molnar Cc: 2.6.33 LKML-Reference: <1269975907-27602-1-git-send-email-jason.wessel@windriver.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index bfba6019d762..b2258ca91003 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -618,8 +618,8 @@ int kgdb_arch_init(void) * portion of kgdb because this operation requires mutexs to * complete. */ + hw_breakpoint_init(&attr); attr.bp_addr = (unsigned long)kgdb_arch_init; - attr.type = PERF_TYPE_BREAKPOINT; attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; -- cgit v1.2.2 From 909fc87b32b3b9e3f0b87dcc5d98319c41900c58 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 29 Mar 2010 09:41:11 +0200 Subject: x86: Handle overlapping mptables We found a system where the MP table MPC and MPF structures overlap. That doesn't really matter because the mptable is not used anyways with ACPI, but it leads to a panic in the early allocator due to the overlapping reservations in 2.6.33. Earlier kernels handled this without problems. Simply change these reservations to reserve_early_overlap_ok to avoid the panic. Reported-by: Thomas Renninger Tested-by: Thomas Renninger Signed-off-by: Andi Kleen LKML-Reference: <20100329074111.GA22821@basil.fritz.box> Signed-off-by: H. 
Peter Anvin Cc: --- arch/x86/kernel/mpparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index a2c1edd2d3ac..e81030f71a8f 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -664,7 +664,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); - reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); + reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -693,7 +693,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); + reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); if (mpf->physptr) smp_reserve_memory(mpf); -- cgit v1.2.2 From 8da854cb02156c90028233ae1e85ce46a1d3f82c Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 25 Feb 2010 10:53:48 -0800 Subject: x86, hpet: Erratum workaround for read after write of HPET comparator On Wed, Feb 24, 2010 at 03:37:04PM -0800, Justin Piszcz wrote: > Hello, > > Again, on the Intel DP55KG board: > > # uname -a > Linux host 2.6.33 #1 SMP Wed Feb 24 18:31:00 EST 2010 x86_64 GNU/Linux > > [ 1.237600] ------------[ cut here ]------------ > [ 1.237890] WARNING: at arch/x86/kernel/hpet.c:404 hpet_next_event+0x70/0x80() > [ 1.238221] Hardware name: > [ 1.238504] hpet: compare register read back failed. > [ 1.238793] Modules linked in: > [ 1.239315] Pid: 0, comm: swapper Not tainted 2.6.33 #1 > [ 1.239605] Call Trace: > [ 1.239886] [] ? warn_slowpath_common+0x73/0xb0 > [ 1.240409] [] ? tick_dev_program_event+0x38/0xc0 > [ 1.240699] [] ? warn_slowpath_fmt+0x40/0x50 > [ 1.240992] [] ? tick_dev_program_event+0x38/0xc0 > [ 1.241281] [] ? hpet_next_event+0x70/0x80 > [ 1.241573] [] ? tick_dev_program_event+0x38/0xc0 > [ 1.241859] [] ? tick_handle_oneshot_broadcast+0xe2/0x100 > [ 1.246533] [] ? timer_interrupt+0x1a/0x30 > [ 1.246826] [] ? handle_IRQ_event+0x39/0xd0 > [ 1.247118] [] ? handle_edge_irq+0xb8/0x160 > [ 1.247407] [] ? handle_irq+0x15/0x20 > [ 1.247689] [] ? do_IRQ+0x62/0xe0 > [ 1.247976] [] ? ret_from_intr+0x0/0xa > [ 1.248262] [] ? mwait_idle+0x57/0x80 > [ 1.248796] [] ? cpu_idle+0x5c/0xb0 > [ 1.249080] ---[ end trace db7f668fb6fef4e1 ]--- > > Is this something Intel has to fix or is it a bug in the kernel? This is a chipset erratum. Thomas: You mentioned we can retain this check only for known-buggy and hpet debug kind of options. But here is the simple workaround patch for this particular erratum. Some chipsets have a erratum due to which read immediately following a write of HPET comparator returns old comparator value instead of most recently written value. Erratum 15 in "Intel I/O Controller Hub 9 (ICH9) Family Specification Update" (http://www.intel.com/assets/pdf/specupdate/316973.pdf) Workaround for the errata is to read the comparator twice if the first one fails. Signed-off-by: Venkatesh Pallipadi LKML-Reference: <20100225185348.GA9674@linux-os.sc.intel.com> Signed-off-by: H. 
Peter Anvin Cc: Venkatesh Pallipadi Cc: --- arch/x86/kernel/hpet.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ee4fa1bfcb33..3d422da92100 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -399,9 +399,15 @@ static int hpet_next_event(unsigned long delta, * then we might have a real hardware problem. We can not do * much about it here, but at least alert the user/admin with * a prominent warning. + * An erratum on some chipsets (ICH9,..), results in comparator read + * immediately following a write returning old value. Workaround + * for this is to read this value second time, when first + * read returns old value. */ - WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, + if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { + WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, KERN_WARNING "hpet: compare register read back failed.\n"); + } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } -- cgit v1.2.2 From b4a5e8a1deca7e61ebaffb37344766b0f0e9f327 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Thu, 11 Mar 2010 14:00:16 -0800 Subject: x86, hpet: Fix bug in RTC emulation We think there exists a bug in the HPET code that emulates the RTC. In the normal case, when the RTC frequency is set, the rtc driver tells the hpet code about it here: int hpet_set_periodic_freq(unsigned long freq) { uint64_t clc; if (!is_hpet_enabled()) return 0; if (freq <= DEFAULT_RTC_INT_FREQ) hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; else { clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; do_div(clc, freq); clc >>= hpet_clockevent.shift; hpet_pie_delta = (unsigned long) clc; } return 1; } If freq is set to 64Hz (DEFAULT_RTC_INT_FREQ) or lower, then hpet_pie_limit (a static) is set to non-zero. Then, on every one-shot HPET interrupt, hpet_rtc_timer_reinit is called to compute the next timeout. Well, that function has this logic: if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) delta = hpet_default_delta; else delta = hpet_pie_delta; Since hpet_pie_limit is not 0, hpet_default_delta is used. That corresponds to 64Hz. Now, if you set a different rtc frequency, you'll take the else path through hpet_set_periodic_freq, but unfortunately no one resets hpet_pie_limit back to 0. Boom....now you are stuck with 64Hz RTC interrupts forever. The patch below just resets the hpet_pie_limit value when the requested freq is greater than DEFAULT_RTC_INT_FREQ, which we think fixes this problem. Signed-off-by: Alok N Kataria LKML-Reference: <201003112200.o2BM0Hre012875@imap1.linux-foundation.org> Signed-off-by: Daniel Hecht Cc: Venkatesh Pallipadi Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/hpet.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3d422da92100..2bda5f0052f7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1149,6 +1149,7 @@ int hpet_set_periodic_freq(unsigned long freq) do_div(clc, freq); clc >>= hpet_clockevent.shift; hpet_pie_delta = clc; + hpet_pie_limit = 0; } return 1; } -- cgit v1.2.2 From 042be38e6106ed70b42d096ab4a1ed4187e510e6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 1 Apr 2010 14:32:43 -0700 Subject: ibft, x86: Change reserve_ibft_region() to find_ibft_region() This allows arch code to decide how to reserve the ibft.
And we should reserve the ibft as early as possible, instead of at the BOOTMEM stage, in case the table is in the RAM range and is not reserved by the BIOS (this will often be the case). Move it to just after find_smp_config(). Also, when CONFIG_NO_BOOTMEM=y, we will not have reserve_bootmem() anymore. -v2: fix typo about ibft, pointed out by Konrad Rzeszutek Wilk Signed-off-by: Yinghai Lu LKML-Reference: <4BB510FB.80601@kernel.org> Cc: Pekka Enberg Cc: Peter Jones Cc: Konrad Rzeszutek Wilk CC: Jan Beulich Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d76e18570c60..580e6b3dbdb8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -608,6 +608,16 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif +static __init void reserve_ibft_region(void) +{ + unsigned long addr, size = 0; + + addr = find_ibft_region(&size); + + if (size) + reserve_early_overlap_ok(addr, addr + size, "ibft"); +} + #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -910,6 +920,8 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); + reserve_ibft_region(); + reserve_trampoline_memory(); #ifdef CONFIG_ACPI_SLEEP @@ -977,8 +989,6 @@ void __init setup_arch(char **cmdline_p) dma32_reserve_bootmem(); - reserve_ibft_region(); - #ifdef CONFIG_KVM_CLOCK kvmclock_init(); #endif -- cgit v1.2.2 From 51591e31dcb3716f03f962e26ec36a029aa46340 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 25 Mar 2010 15:39:27 -0700 Subject: x86: Increase CONFIG_NODES_SHIFT max to 10 Some larger systems require more than 512 nodes, so increase the maximum CONFIG_NODES_SHIFT to 10 for a new max of 1024 nodes. This was tested with numa=fake=64M on systems with more than 64GB of RAM. A total of 1022 nodes were initialized. Successfully builds with no additional warnings on x86_64 allyesconfig. ( No effect on any existing config. Newly enabled CONFIG_MAXSMP=y will see the new default. ) Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0eacb1ffb421..9458685902bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1216,8 +1216,8 @@ config NUMA_EMU config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP - range 1 9 - default "9" if MAXSMP + range 1 10 + default "10" if MAXSMP default "6" if X86_64 default "4" if X86_NUMAQ default "3" -- cgit v1.2.2 From 85257024096a96fc5c00ce59d685f62bbed3ad95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Mar 2010 19:30:52 +0100 Subject: x86: Move notify_cpu_starting() callback to a later stage Because we need to have cpu identification things done by the time we run CPU_STARTING notifiers. ( This init ordering will be relied on by the next fix.
) Signed-off-by: Peter Zijlstra LKML-Reference: <1269353485.5109.48.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 06d98ae5a802..6808b934d6c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -242,8 +242,6 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); - notify_cpu_starting(cpuid); - /* * Need to setup vector mappings before we enable interrupts. */ @@ -264,6 +262,8 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + notify_cpu_starting(cpuid); + /* * Allow the master to continue. */ -- cgit v1.2.2 From b38b24ead33417146e051453d04bf60b8d2d7e25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Mar 2010 19:31:15 +0100 Subject: perf, x86: Fix AMD hotplug & constraint initialization Commit 3f6da39 ("perf: Rework and fix the arch CPU-hotplug hooks") moved the amd northbridge allocation from CPUS_ONLINE to CPUS_PREPARE_UP however amd_nb_id() doesn't work yet on prepare so it would simply bail basically reverting to a state where we do not properly track node wide constraints - causing weird perf results. Fix up the AMD NorthBridge initialization code by allocating from CPU_UP_PREPARE and installing it from CPU_STARTING once we have the proper nb_id. It also properly deals with the allocation failing. Signed-off-by: Peter Zijlstra [ robustify using amd_has_nb() ] Signed-off-by: Stephane Eranian LKML-Reference: <1269353485.5109.48.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 8 ++-- arch/x86/kernel/cpu/perf_event_amd.c | 80 +++++++++++++++++++++--------------- 2 files changed, 52 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5fb490c6ee5c..bd28cf9d8a82 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -158,7 +158,7 @@ struct x86_pmu { struct perf_event *event); struct event_constraint *event_constraints; - void (*cpu_prepare)(int cpu); + int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); @@ -1333,11 +1333,12 @@ static int __cpuinit x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; + int ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: if (x86_pmu.cpu_prepare) - x86_pmu.cpu_prepare(cpu); + ret = x86_pmu.cpu_prepare(cpu); break; case CPU_STARTING: @@ -1350,6 +1351,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) x86_pmu.cpu_dying(cpu); break; + case CPU_UP_CANCELED: case CPU_DEAD: if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); @@ -1359,7 +1361,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; } - return NOTIFY_OK; + return ret; } static void __init pmu_check_apic(void) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index b87e0b6970cb..db6f7d4056e1 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -137,6 +137,13 @@ static inline int amd_is_nb_event(struct hw_perf_event *hwc) return (hwc->config & 0xe0) == 0xe0; } +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + 
static void amd_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -147,7 +154,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, /* * only care about NB events */ - if (!(nb && amd_is_nb_event(hwc))) + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) return; /* @@ -214,7 +221,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) /* * if not NB event or no NB, then no constraints */ - if (!(nb && amd_is_nb_event(hwc))) + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) return &unconstrained; /* @@ -293,51 +300,55 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) return nb; } -static void amd_pmu_cpu_online(int cpu) +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu, -1); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) { - struct cpu_hw_events *cpu1, *cpu2; - struct amd_nb *nb = NULL; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct amd_nb *nb; int i, nb_id; if (boot_cpu_data.x86_max_cores < 2) return; - /* - * function may be called too early in the - * boot process, in which case nb_id is bogus - */ nb_id = amd_get_nb_id(cpu); - if (nb_id == BAD_APICID) - return; - - cpu1 = &per_cpu(cpu_hw_events, cpu); - cpu1->amd_nb = NULL; + WARN_ON_ONCE(nb_id == BAD_APICID); raw_spin_lock(&amd_nb_lock); for_each_online_cpu(i) { - cpu2 = &per_cpu(cpu_hw_events, i); - nb = cpu2->amd_nb; - if (!nb) + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) continue; - if (nb->nb_id == nb_id) - goto found; - } - nb = amd_alloc_nb(cpu, nb_id); - if (!nb) { - pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); - raw_spin_unlock(&amd_nb_lock); - return; + if (nb->nb_id == nb_id) { + kfree(cpuc->amd_nb); + cpuc->amd_nb = nb; + break; + } } -found: - nb->refcnt++; - cpu1->amd_nb = nb; + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; raw_spin_unlock(&amd_nb_lock); } -static void amd_pmu_cpu_offline(int cpu) +static void amd_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuhw; @@ -349,8 +360,10 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_lock(&amd_nb_lock); if (cpuhw->amd_nb) { - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); cpuhw->amd_nb = NULL; } @@ -379,8 +392,9 @@ static __initconst struct x86_pmu amd_pmu = { .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, - .cpu_prepare = amd_pmu_cpu_online, - .cpu_dead = amd_pmu_cpu_offline, + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, }; static __init int amd_pmu_init(void) -- cgit v1.2.2 From 257ef9d21f1b008a6c7425544b36641c4325a922 Mon Sep 17 00:00:00 2001 From: Torok Edwin Date: Wed, 17 Mar 2010 12:07:16 +0200 Subject: perf, x86: Fix callgraphs of 32-bit processes on 64-bit kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When profiling a 32-bit process on a 64-bit kernel, callgraph tracing stopped after the first function because it saw a garbage memory address (it tried to interpret the frame pointer and return address as 64-bit pointers).
Fix this by using a struct stack_frame with 32-bit pointers when the TIF_IA32 flag is set. Note that TIF_IA32 flag must be used, and not is_compat_task(), because the latter is only set when the 32-bit process is executing a syscall, which may not always be the case (when tracing page fault events for example). Signed-off-by: Török Edwin Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Paul Mackerras Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org LKML-Reference: <1268820436-13145-1-git-send-email-edwintorok@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 44 +++++++++++++++++++++++++++++++++++----- arch/x86/kernel/dumpstack.h | 5 +++++ 2 files changed, 44 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bd28cf9d8a82..53ea4cf1a878 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -28,6 +28,7 @@ #include #include #include +#include static u64 perf_event_mask __read_mostly; @@ -1630,14 +1631,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +#ifdef CONFIG_COMPAT +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bytes; + /* 32-bit process in 64-bit kernel. */ + struct stack_frame_ia32 frame; + const void __user *fp; + + if (!test_thread_flag(TIF_IA32)) + return 0; + + fp = compat_ptr(regs->bp); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) + break; + + if (fp < compat_ptr(regs->sp)) + break; - return bytes == sizeof(*frame); + callchain_store(entry, frame.return_address); + fp = compat_ptr(frame.next_frame); + } + return 1; } +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; +} +#endif static void perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) @@ -1653,11 +1682,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); + if (perf_callchain_user32(regs, entry)) + return; + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) break; if ((unsigned long)fp < regs->sp) diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 29e5f7c845b2..e39e77168a37 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -30,6 +30,11 @@ struct stack_frame { unsigned long return_address; }; +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + static inline unsigned long rewind_frame_pointer(int n) { struct stack_frame *frame; -- cgit v1.2.2 From 472a474c6630efd195d3738339fd1bdc8aa3b1aa Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 31 Mar 2010 18:04:47 -0700 Subject: x86: Fix double enable_IR_x2apic() call on SMP kernel on !SMP boards Jan Grossmann reported kernel boot panic while booting SMP kernel on his system with a single core cpu. 
SMP kernels call enable_IR_x2apic() from native_smp_prepare_cpus() and on platforms where the kernel doesn't find an SMP configuration we ended up again calling enable_IR_x2apic() from the APIC_init_uniprocessor() call in the smp_sanity_check(). Thus leading to a kernel panic. Don't call enable_IR_x2apic() and default_setup_apic_routing() from APIC_init_uniprocessor() in the CONFIG_SMP case. NOTE: this kind of non-idempotent and asymmetric initialization sequence is rather fragile and unclean, we'll clean that up in v2.6.35. This is the minimal fix for v2.6.34. Reported-by: Jan.Grossmann@kielnet.net Signed-off-by: Suresh Siddha Cc: Cc: Cc: Cc: Cc: Cc: # [v2.6.32.x, v2.6.33.x] LKML-Reference: <1270083887.7835.78.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 00187f1fcfb7..e5a4a1e01618 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1640,8 +1640,10 @@ int __init APIC_init_uniprocessor(void) } #endif +#ifndef CONFIG_SMP enable_IR_x2apic(); default_setup_apic_routing(); +#endif verify_local_APIC(); connect_bsp_APIC(); -- cgit v1.2.2 From 134fbadf028a5977a1b06b0253d3ee33e6f0c642 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Tue, 6 Apr 2010 10:01:19 -0400 Subject: perf, x86: Enable Nehalem-EX support According to Intel Software Devel Manual Volume 3B, the Nehalem-EX PMU is just like regular Nehalem (except for the uncore support, which is completely different). Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "H. Peter Anvin" Cc: Arjan van de Ven Cc: Lin Ming LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 84bfde64a337..9c794ac87837 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -936,6 +936,7 @@ static __init int intel_pmu_init(void) case 26: /* 45 nm nehalem, "Bloomfield" */ case 30: /* 45 nm nehalem, "Lynnfield" */ + case 46: /* 45 nm nehalem-ex, "Beckton" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.2.2 From 75f66533bc883f761a7adcab3281fe3323efbc90 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 2 Apr 2010 18:27:52 -0700 Subject: x86/amd-iommu: enable iommu before attaching devices Hit another kdump problem as reported by Neil Horman. When initializing the IOMMU, we attach devices to their domains before the IOMMU is fully (re)initialized. Attaching a device will issue some important invalidations. In the context of the newly kexec'd kdump kernel, the IOMMU may have stale cached data from the original kernel. Because we do the attach too early, the invalidation commands are placed in the new command buffer before the IOMMU is updated w/ that buffer. This leaves the stale entries in the kdump context and can render devices unusable. Simply enable the IOMMU before we do the attach.
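In outline, the fixed ordering inside amd_iommu_init() looks like the following sketch (simplified from the diff below; the amd_iommu_init_dma_ops() name on the else branch is assumed from context, and unrelated error handling is trimmed):

	/* Bring the IOMMU hardware up first, so that the invalidations
	 * issued while attaching devices reach a live, freshly programmed
	 * IOMMU instead of the stale state left over from the crashed
	 * kernel. */
	enable_iommus();

	if (iommu_pass_through)
		ret = amd_iommu_init_passthrough();
	else
		ret = amd_iommu_init_dma_ops();	/* assumed name */
	if (ret)
		goto free;
	...
free:
	disable_iommus();	/* undo the early enable on failure */
	amd_iommu_uninit_devices();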
Cc: stable@kernel.org Cc: Neil Horman Cc: Vivek Goyal Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index feaf47184900..8975965f3e67 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1304,6 +1304,8 @@ static int __init amd_iommu_init(void) if (ret) goto free; + enable_iommus(); + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else @@ -1316,8 +1318,6 @@ static int __init amd_iommu_init(void) amd_iommu_init_notifier(); - enable_iommus(); - if (iommu_pass_through) goto out; @@ -1331,6 +1331,7 @@ out: return ret; free: + disable_iommus(); amd_iommu_uninit_devices(); -- cgit v1.2.2 From 549c90dc9a6d659e792b2a42a0930c7da015ea4a Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 2 Apr 2010 18:27:53 -0700 Subject: x86/amd-iommu: warn when issuing command to uninitialized cmd buffer To catch future potential issues we can add a warning whenever we issue a command before the command buffer is fully initialized. Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 1 + arch/x86/kernel/amd_iommu.c | 1 + arch/x86/kernel/amd_iommu_init.c | 5 +++-- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 5e46e78f3b1b..86a0ff0aeac7 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -141,6 +141,7 @@ /* constants to configure the command buffer */ #define CMD_BUFFER_SIZE 8192 +#define CMD_BUFFER_UNINITIALIZED 1 #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b06f29e275e9..71dfc0af8e50 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) u32 tail, head; u8 *target; + WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8975965f3e67..5edf41c7127c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -438,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) if (cmd_buf == NULL) return NULL; - iommu->cmd_buf_size = CMD_BUFFER_SIZE; + iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED; return cmd_buf; } @@ -474,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) &entry, sizeof(entry)); amd_iommu_reset_cmd_buffer(iommu); + iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED); } static void __init free_command_buffer(struct amd_iommu *iommu) { free_pages((unsigned long)iommu->cmd_buf, - get_order(iommu->cmd_buf_size)); + get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED))); } /* allocates the memory where the IOMMU will log its events to */ -- cgit v1.2.2 From 8f9f55e83e939724490d7cde3833c4883c6d1310 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 2 Apr 2010 18:27:54 -0700 Subject: Revert "x86: disable IOMMUs on kernel crash" This effectively reverts 
commit 61d047be99757fd9b0af900d7abce9a13a337488. Disabling the IOMMU can potentially allow DMA transactions to complete without being translated. Leave it enabled, and allow the crash kernel to do the IOMMU reinitialization properly. Cc: stable@kernel.org Cc: Joerg Roedel Cc: Eric Biederman Cc: Neil Horman Cc: Vivek Goyal Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/crash.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a4849c10a77e..ebd4c51d096a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,7 +27,6 @@ #include #include #include -#include #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif - -#ifdef CONFIG_X86_64 - x86_platform.iommu_shutdown(); -#endif - crash_save_cpu(regs, safe_smp_processor_id()); } -- cgit v1.2.2 From d18c69d3898985c66cd6e878b8f576fd9a21ab39 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 2 Apr 2010 18:27:55 -0700 Subject: x86/amd-iommu: use for_each_pci_dev Replace the open-coded version with for_each_pci_dev Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 71dfc0af8e50..494956813951 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2187,7 +2187,7 @@ static void prealloc_protection_domains(void) struct dma_ops_domain *dma_dom; u16 devid; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { /* Do we handle this device? */ if (!check_device(&dev->dev)) -- cgit v1.2.2 From 4b83873d3da0704987cb116833818ed96214ee29 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 7 Apr 2010 12:57:35 +0200 Subject: x86/gart: Disable GART explicitly before initialization If we boot into a crash-kernel the gart might still be enabled and its caches might be dirty. This can result in undefined behavior later. Fix it by explicitly disabling the gart hardware before initialization and flushing the caches after enablement. Signed-off-by: Joerg Roedel --- arch/x86/kernel/aperture_64.c | 15 ++++++++++++++- arch/x86/kernel/pci-gart_64.c | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3704997e8b25..b5d8b0bcf235 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -393,6 +393,7 @@ void __init gart_iommu_hole_init(void) for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; + u32 ctl; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -406,7 +407,19 @@ void __init gart_iommu_hole_init(void) gart_iommu_aperture = 1; x86_init.iommu.iommu_init = gart_iommu_init; - aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); + + /* + * Before we do anything else disable the GART. It may + * still be enabled if we boot into a crash-kernel here. + * Reconfiguring the GART while it is enabled could have + * unknown side-effects.
+ */ + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + + aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index f3af115a573a..0ae24d9b44b3 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -564,6 +564,9 @@ static void enable_gart_translations(void) enable_gart_translation(dev, __pa(agp_gatt_table)); } + + /* Flush the GART-TLB to remove stale entries */ + k8_flush_garts(); } /* -- cgit v1.2.2 From 73a0e614580fb650846be1e9315f6b7b6069b9cc Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 6 Apr 2010 13:24:08 -0600 Subject: x86/PCI: ignore Consumer/Producer bit in ACPI window descriptions ACPI Address Space Descriptors (used in _CRS) have a Consumer/Producer bit that is supposed to distinguish regions that are consumed directly by a device from those that are forwarded ("produced") by a bridge. But BIOSes have apparently not used this consistently, and Windows seems to ignore it, so I think Linux should ignore it as well. I can't point to any of these supposed broken BIOSes, but since we now rely on _CRS by default, I think it's safer to ignore this bit from the start. Here are details of my experiments with how Windows handles it: https://bugzilla.kernel.org/show_bug.cgi?id=15701 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c7b1ebfb7da7..334153ca4c30 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -71,8 +71,7 @@ resource_to_addr(struct acpi_resource *resource, if (ACPI_SUCCESS(status) && (addr->resource_type == ACPI_MEMORY_RANGE || addr->resource_type == ACPI_IO_RANGE) && - addr->address_length > 0 && - addr->producer_consumer == ACPI_PRODUCER) { + addr->address_length > 0) { return AE_OK; } return AE_ERROR; -- cgit v1.2.2 From ab285f2b5290d92b7ec1a6f9aad54308dadf6157 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 8 Apr 2010 14:05:50 +0200 Subject: perf: Fix unsafe frame rewinding with hot regs fetching When we fetch the hot regs and rewind to the nth caller, it might happen that we dereference a frame pointer outside the kernel stack boundaries, like in this example: perf_trace_sched_switch+0xd5/0x120 schedule+0x6b5/0x860 retint_careful+0xd/0x21 Since we directly dereference a userspace frame pointer here while rewinding behind retint_careful, this may end up in a crash. Fix this by simply using probe_kernel_address() when we rewind the frame pointer. This issue will have a much more proper fix in the next version of the perf_arch_fetch_caller_regs() API that will only need to rewind to the first caller. 
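For reference, probe_kernel_address() (from <linux/uaccess.h>) returns 0 on success and non-zero if the address cannot be read without faulting, which is what makes it safe even in NMI context. The new rewind loop, annotated (a sketch of the hunk below):

	struct stack_frame *frame;

	get_bp(frame);			/* current frame pointer */
#ifdef CONFIG_FRAME_POINTER
	while (n--) {
		/* Read frame->next_frame, but survive a bogus pointer
		 * (e.g. a userspace frame behind retint_careful)
		 * instead of oopsing. */
		if (probe_kernel_address(&frame->next_frame, frame))
			break;
	}
#endif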
Reported-by: Eric Dumazet Signed-off-by: Frederic Weisbecker Tested-by: Eric Dumazet Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: David Miller Cc: Archs --- arch/x86/kernel/dumpstack.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index e39e77168a37..e1a93be4fd44 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -14,6 +14,8 @@ #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif +#include + extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl); @@ -42,8 +44,10 @@ static inline unsigned long rewind_frame_pointer(int n) get_bp(frame); #ifdef CONFIG_FRAME_POINTER - while (n--) - frame = frame->next_frame; + while (n--) { + if (probe_kernel_address(&frame->next_frame, frame)) + break; + } #endif return (unsigned long)frame; -- cgit v1.2.2 From 091ebf07a2408f9a56634caa0f86d9360e9af23b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 14 Apr 2010 21:43:54 -0600 Subject: lguest: stop using KVM hypercall mechanism This is a partial revert of 4cd8b5e2a159 "lguest: use KVM hypercalls"; we revert to using (just as questionable but more reliable) int $15 for hypercalls. I didn't revert the register mapping, so we still use the same calling convention as kvm. KVM in more recent incarnations stopped injecting a fault when a guest tried to use the VMCALL instruction from ring 1, so lguest under kvm fails to make hypercalls. It was nice to share code with our KVM cousins, but this was overreach. Signed-off-by: Rusty Russell Cc: Matias Zabaljauregui Cc: Avi Kivity --- arch/x86/include/asm/lguest_hcall.h | 29 ++++++++++++++---- arch/x86/lguest/boot.c | 61 ++++++++++++++++++------------------- arch/x86/lguest/i386_head.S | 2 +- 3 files changed, 54 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index ba0eed8aa1a6..b60f2924c413 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -28,22 +28,39 @@ #ifndef __ASSEMBLY__ #include -#include /*G:030 * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism, though completely different hypercall - * numbers. Seventeen hypercalls are available: the hypercall number is put in - * the %eax register, and the arguments (when required) are placed in %ebx, - * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. + * Our hypercall mechanism uses the highest unused trap code (traps 32 and + * above are used by real hardware interrupts). Seventeen hypercalls are + * available: the hypercall number is put in the %eax register, and the + * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. + * If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's * definition of a gentleman: "someone who is only rude intentionally". -:*/ + */ +static inline unsigned long +hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4) +{ + /* "int" is the Intel instruction to trigger a trap. 
*/ + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) + /* The call in %eax (aka "a") might be overwritten */ + : "=a"(call) + /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ + : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) + /* "memory" means this might write somewhere in memory. + * This isn't true for all calls, but it's safe to tell + * gcc that it might happen so it doesn't get clever. */ + : "memory"); + return call; +} /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7e59dc1d3fc2..2bdf628066bd 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -115,7 +115,7 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ - kvm_hypercall4(call, arg1, arg2, arg3, arg4); + hcall(call, arg1, arg2, arg3, arg4); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; @@ -145,46 +145,45 @@ static void async_hcall(unsigned long call, unsigned long arg1, * So, when we're in lazy mode, we call async_hcall() to store the call for * future processing: */ -static void lazy_hcall1(unsigned long call, - unsigned long arg1) +static void lazy_hcall1(unsigned long call, unsigned long arg1) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall1(call, arg1); + hcall(call, arg1, 0, 0, 0); else async_hcall(call, arg1, 0, 0, 0); } /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) + unsigned long arg1, + unsigned long arg2) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall2(call, arg1, arg2); + hcall(call, arg1, arg2, 0, 0); else async_hcall(call, arg1, arg2, 0, 0); } static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) + unsigned long arg1, + unsigned long arg2, + unsigned long arg3) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall3(call, arg1, arg2, arg3); + hcall(call, arg1, arg2, arg3, 0); else async_hcall(call, arg1, arg2, arg3, 0); } #ifdef CONFIG_X86_PAE static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall4(call, arg1, arg2, arg3, arg4); + hcall(call, arg1, arg2, arg3, arg4); else async_hcall(call, arg1, arg2, arg3, arg4); } @@ -196,13 +195,13 @@ static void lazy_hcall4(unsigned long call, :*/ static void lguest_leave_lazy_mmu_mode(void) { - kvm_hypercall0(LHCALL_FLUSH_ASYNC); + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_leave_lazy_mmu(); } static void lguest_end_context_switch(struct task_struct *next) { - kvm_hypercall0(LHCALL_FLUSH_ASYNC); + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_end_context_switch(next); } @@ -286,7 +285,7 @@ static void lguest_write_idt_entry(gate_desc *dt, /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); /* Tell Host about this new entry. 
*/ - kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); + hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); } /* @@ -300,7 +299,7 @@ static void lguest_load_idt(const struct desc_ptr *desc) struct desc_struct *idt = (void *)desc->address; for (i = 0; i < (desc->size+1)/8; i++) - kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); + hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); } /* @@ -321,7 +320,7 @@ static void lguest_load_gdt(const struct desc_ptr *desc) struct desc_struct *gdt = (void *)desc->address; for (i = 0; i < (desc->size+1)/8; i++) - kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); + hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); } /* @@ -334,8 +333,8 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, { native_write_gdt_entry(dt, entrynum, desc, type); /* Tell Host about this new entry. */ - kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b); + hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, + dt[entrynum].a, dt[entrynum].b, 0); } /* @@ -931,7 +930,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta, } /* Please wake us this far in the future. */ - kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta); + hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); return 0; } @@ -942,7 +941,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode, case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: /* A 0 argument shuts the clock down. */ - kvm_hypercall0(LHCALL_SET_CLOCKEVENT); + hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); break; case CLOCK_EVT_MODE_ONESHOT: /* This is what we expect. */ @@ -1100,7 +1099,7 @@ static void set_lguest_basic_apic_ops(void) /* STOP! Until an interrupt comes in. */ static void lguest_safe_halt(void) { - kvm_hypercall0(LHCALL_HALT); + hcall(LHCALL_HALT, 0, 0, 0, 0); } /* @@ -1112,8 +1111,8 @@ static void lguest_safe_halt(void) */ static void lguest_power_off(void) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF); + hcall(LHCALL_SHUTDOWN, __pa("Power down"), + LGUEST_SHUTDOWN_POWEROFF, 0, 0); } /* @@ -1123,7 +1122,7 @@ static void lguest_power_off(void) */ static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF); + hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); /* The hcall won't return, but to keep gcc happy, we're "done". */ return NOTIFY_DONE; } @@ -1162,7 +1161,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) len = sizeof(scratch) - 1; scratch[len] = '\0'; memcpy(scratch, buf, len); - kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch)); + hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0); /* This routine returns the number of bytes actually written. */ return len; @@ -1174,7 +1173,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) */ static void lguest_restart(char *reason) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); + hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); } /*G:050 diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 27eac0faee48..4f420c2f2d55 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -32,7 +32,7 @@ ENTRY(lguest_entry) */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx - .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + int $LGUEST_TRAP_ENTRY /* Set up the initial stack so we can run C code. 
*/ movl $(init_thread_union+THREAD_SIZE),%esp -- cgit v1.2.2 From 7567cae105e435b53e5a3e778546dd3ec53e3204 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 9 Mar 2010 12:01:10 +0200 Subject: KVM: take srcu lock before call to complete_pio() complete_pio() may use slot table which is protected by srcu. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 24cd0ee896e9..2eb999dc9774 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4483,7 +4483,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_set_cr8(vcpu, kvm_run->cr8); if (vcpu->arch.pio.cur_count) { + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = complete_pio(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r) goto out; } -- cgit v1.2.2 From b7af40433870aa0636932ad39b0c48a0cb319057 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 9 Mar 2010 14:55:19 +0900 Subject: KVM: SVM: Fix memory leaks that happen when svm_create_vcpu() fails svm_create_vcpu() does not free the pages allocated during the creation when it fails to complete the allocations. This patch fixes it. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 445c59411ed0..2ba58206812a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -706,29 +706,28 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; + err = -ENOMEM; page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; + if (!page) goto uninit; - } - err = -ENOMEM; msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) - goto uninit; + goto free_page1; nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) - goto uninit; - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + goto free_page2; hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) - goto uninit; + goto free_page3; + svm->nested.hsave = page_address(hsave_page); + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); @@ -744,6 +743,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: -- cgit v1.2.2 From d6a23895aa82353788a1cc5a1d9a1c963465463e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 11 Mar 2010 12:20:03 +0200 Subject: KVM: Don't spam kernel log when injecting exceptions due to bad cr writes These are guest-triggerable. 
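The resulting pattern is uniform across all of these paths: reject the write by injecting #GP into the guest, with no host-side logging. A sketch of one of the checks, for illustration:

	if (cr4 & CR4_RESERVED_BITS) {
		/* No printk here: the guest can trigger this at will
		 * and would otherwise flood the host kernel log. */
		kvm_inject_gp(vcpu, 0);
		return;
	}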
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2eb999dc9774..8f9b08d72c4d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -433,8 +433,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #ifdef CONFIG_X86_64 if (cr0 & 0xffffffff00000000UL) { - printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } @@ -443,14 +441,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - printk(KERN_DEBUG "set_cr0: #GP, set PG flag " - "and a clear PE flag\n"); kvm_inject_gp(vcpu, 0); return; } @@ -461,15 +456,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) int cs_db, cs_l; if (!is_pae(vcpu)) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while PAE is disabled\n"); kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while CS.L == 1\n"); kvm_inject_gp(vcpu, 0); return; @@ -477,8 +468,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } else #endif if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr0: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -505,28 +494,23 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) { - printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " - "in long mode\n"); kvm_inject_gp(vcpu, 0); return; } } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (cr4 & X86_CR4_VMXE) { - printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); kvm_inject_gp(vcpu, 0); return; } @@ -547,21 +531,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -593,7 +572,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); kvm_inject_gp(vcpu, 0); return; } @@ -649,15 +627,12 @@ static u32 emulated_msrs[] = { static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while 
paging\n"); kvm_inject_gp(vcpu, 0); return; } @@ -667,7 +642,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); kvm_inject_gp(vcpu, 0); return; } @@ -678,7 +652,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); kvm_inject_gp(vcpu, 0); return; } -- cgit v1.2.2 From 114be429c8cd44e57f312af2bbd6734e5a185b0d Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 24 Mar 2010 17:46:42 +0100 Subject: KVM: allow bit 10 to be cleared in MSR_IA32_MC4_CTL There is a quirk for AMD K8 CPUs in many Linux kernels (see arch/x86/kernel/cpu/mcheck/mce.c:__mcheck_cpu_apply_quirks()) that clears bit 10 in that MCE-related MSR. KVM can only cope with all zeros or all ones, so it will inject a #GP into the guest, which will let it panic. So let's add a quirk to the quirk and ignore this single cleared bit. This fixes -cpu kvm64 on all machines and -cpu host on K8 machines with some guest Linux kernels. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f9b08d72c4d..9ad3d064c781 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -940,9 +940,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MC0_CTL + 4 * bank_num) { u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL */ + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ if ((offset & 0x3) == 0 && - data != 0 && data != ~(u64)0) + data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; vcpu->arch.mce_banks[offset] = data; break; -- cgit v1.2.2 From 78ac8b47c566dd6177a3b9b291b756ccb70670b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 8 Apr 2010 18:19:35 +0300 Subject: KVM: VMX: Save/restore rflags.vm correctly in real mode Currently we set eflags.vm unconditionally when entering real mode emulation through virtual-8086 mode, and clear it unconditionally when we enter protected mode. This means that the following sequence KVM_SET_REGS (rflags.vm=1) KVM_SET_SREGS (cr0.pe=1) ends up with rflags.vm clear due to KVM_SET_SREGS triggering enter_pmode(). Fix by shadowing rflags.vm (and rflags.iopl) correctly while in real mode: reads and writes to those bits access a shadow register instead of the actual register.
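Seen from userspace, the sequence that used to lose rflags.vm looks roughly like this (a sketch only; vcpu_fd setup and the rest of the register state are omitted):

	regs.rflags |= X86_EFLAGS_VM;		/* request virtual-8086 mode */
	ioctl(vcpu_fd, KVM_SET_REGS, &regs);

	sregs.cr0 |= X86_CR0_PE;		/* protected mode enabled */
	ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);	/* used to trigger
						   enter_pmode() and wipe
						   rflags.vm */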
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 686492ed3079..bc933cfb4e66 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -77,6 +77,8 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -131,7 +133,7 @@ struct vcpu_vmx { } host_state; struct { int vm86_active; - u8 save_iopl; + ulong save_rflags; struct kvm_save_segment { u16 selector; unsigned long base; @@ -818,18 +820,23 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + unsigned long rflags, save_rflags; rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) - rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (to_vmx(vcpu)->rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } vmcs_writel(GUEST_RFLAGS, rflags); } @@ -1483,8 +1490,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1557,8 +1564,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; -- cgit v1.2.2 From 77662e0028c7c63e34257fda03ff9625c59d939d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Apr 2010 16:34:42 +0800 Subject: KVM: MMU: fix kvm_mmu_zap_page() and its calling path This patch fixes: - calculate zapped page number properly in mmu_zap_unsync_children() - calculate freed page number properly in kvm_mmu_change_mmu_pages() - if it zapped a child page it should restart the hlist walking KVM-Stable-Tag.
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 48aeee8eefb0..19a8906bcaa2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1490,8 +1490,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, for_each_sp(pages, sp, parents, i) { kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); + zapped++; } - zapped += pages.nr; kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1542,14 +1542,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) */ if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages && + !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_zap_page(kvm, page); used_pages--; } + kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } else @@ -1596,7 +1598,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + nn = bucket->first; } } } -- cgit v1.2.2 From 87bf6e7de1134f48681fd2ce4b7c1ec45458cb6d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 12 Apr 2010 19:35:35 +0900 Subject: KVM: fix the handling of dirty bitmaps to avoid overflows Int is not long enough to store the size of a dirty bitmap. This patch fixes this problem with the introduction of a wrapper function to calculate the sizes of dirty bitmaps. Note: in mark_page_dirty(), we have to consider the fact that __set_bit() takes the offset as int, not long. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ad3d064c781..45aa90f8cc57 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2612,8 +2612,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r, n, i; + int r, i; struct kvm_memory_slot *memslot; + unsigned long n; unsigned long is_dirty = 0; unsigned long *dirty_bitmap = NULL; @@ -2628,7 +2629,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (!memslot->dirty_bitmap) goto out; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); r = -ENOMEM; dirty_bitmap = vmalloc(n); -- cgit v1.2.2 From 4cecd935f67bf46a9fe8037c710dd86651fcafe4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Apr 2010 05:31:02 +0200 Subject: x86: correctly wire up the newuname system call Before commit e28cbf22933d0c0ccaf3c4c27a1a263b41f73859 ("improve sys_newuname() for compat architectures") 64-bit x86 had a private implementation of sys_uname which was just called sys_uname, which other architectures used for the old uname. Due to some merge issues with the uname refactoring patches we ended up calling the old uname version for both the old and new system call slots, which led to the domainname field never being set, which caused failures with libnss_nis.
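The reason domainname in particular broke: the old-style uname structure simply has no such field, so when the old implementation services the new slot, that part of the user buffer is never written. Roughly (declarations recalled from include/linux/utsname.h of that era; the 65-byte field size, __NEW_UTS_LEN + 1, is an assumption here):

	struct old_utsname {
		char sysname[65];
		char nodename[65];
		char release[65];
		char version[65];
		char machine[65];	/* no domainname field */
	};

	struct new_utsname {
		/* ...the same five fields... */
		char domainname[65];	/* only filled in by sys_newuname() */
	};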
Reported-and-tested-by: Andy Isaacson Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 59b4556a5b92..e790bc1fbfa3 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -626,7 +626,7 @@ ia32_sys_call_table: .quad stub32_sigreturn .quad stub32_clone /* 120 */ .quad sys_setdomainname - .quad sys_uname + .quad sys_newuname .quad sys_modify_ldt .quad compat_sys_adjtimex .quad sys32_mprotect /* 125 */ -- cgit v1.2.2 From ae7c9b70dcb4313ea3dbcc9a2f240dae6c2b50c0 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 19 Apr 2010 11:23:43 -0700 Subject: x86, mrst: Conditionally register cpu hotplug notifier for apbt The APB timer is used on Moorestown platforms but not on a standard PC. If the APB timer code is compiled in but not initialized at run-time due to the lack of a FW-reported SFI table, the kernel would panic when the non-boot CPUs are offlined and the notifier is called. https://bugzilla.kernel.org/show_bug.cgi?id=15786 This patch ensures the CPU hotplug notifier for the APB timer is only registered when the APBT timer block is initialized. Signed-off-by: Jacob Pan LKML-Reference: <1271701423-1162-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apb_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index ff469e470059..a35347501d36 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -429,7 +429,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (disable_apbt_percpu) + if (disable_apbt_percpu || !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ hotcpu_notifier(apbt_cpuhp_notify, -20); -- cgit v1.2.2 From e8861cfe2c75bdce36655b64d7ce02c2b31b604d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Apr 2010 16:57:11 +0200 Subject: KVM: x86: Fix TSS size check for 16-bit tasks A 16-bit TSS is only 44 bytes long. So make sure to test for the correct size on task switch.
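The arithmetic behind the check (segment limits are size - 1): a 32-bit TSS is at least 104 bytes, so its limit must be at least 0x67, while a 16-bit TSS is 44 bytes, so its limit must be at least 0x2b; bit 3 of the descriptor type tells the two apart. An annotated sketch of the test added by the diff below:

	desc_limit = get_desc_limit(&nseg_desc);
	if (!nseg_desc.p ||					/* not present */
	    ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||	/* 32-bit TSS */
	     desc_limit < 0x2b)) {				/* 16-bit TSS */
		kvm_queue_exception_e(vcpu, TS_VECTOR,
				      tss_selector & 0xfffc);
		return 1;
	}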
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 45aa90f8cc57..3c4ca98ad27f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5126,6 +5126,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) int ret = 0; u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + u32 desc_limit; old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); @@ -5148,7 +5149,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } } - if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { + desc_limit = get_desc_limit(&nseg_desc); + if (!nseg_desc.p || + ((desc_limit < 0x67 && (nseg_desc.type & 8)) || + desc_limit < 0x2b)) { kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); return 1; } -- cgit v1.2.2 From 66528fdd45b082bf7c74687d72ae08afa4a446f8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 20 Apr 2010 13:52:41 -0600 Subject: x86/PCI: parse additional host bridge window resource types This adds support for Memory24, Memory32, and Memory32Fixed descriptors in PCI host bridge _CRS. I experimentally determined that Windows (2008 R2) accepts these descriptors and treats them as windows that are forwarded to the PCI bus, e.g., if it finds any PCI devices with BARs outside the windows, it moves them into the windows. I don't know whether any machines actually use these descriptors in PCI host bridge _CRS methods, but if any exist and they're new enough that we automatically turn on "pci=use_crs", they will work with Windows but not with Linux. Here are the details: https://bugzilla.kernel.org/show_bug.cgi?id=15817 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 334153ca4c30..44f83ce02470 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -66,13 +66,44 @@ resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) { acpi_status status; - - status = acpi_resource_to_address64(resource, addr); - if (ACPI_SUCCESS(status) && - (addr->resource_type == ACPI_MEMORY_RANGE || - addr->resource_type == ACPI_IO_RANGE) && - addr->address_length > 0) { + struct acpi_resource_memory24 *memory24; + struct acpi_resource_memory32 *memory32; + struct acpi_resource_fixed_memory32 *fixed_memory32; + + memset(addr, 0, sizeof(*addr)); + switch (resource->type) { + case ACPI_RESOURCE_TYPE_MEMORY24: + memory24 = &resource->data.memory24; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = memory24->minimum; + addr->address_length = memory24->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; + return AE_OK; + case ACPI_RESOURCE_TYPE_MEMORY32: + memory32 = &resource->data.memory32; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = memory32->minimum; + addr->address_length = memory32->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; return AE_OK; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + fixed_memory32 = &resource->data.fixed_memory32; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = fixed_memory32->address; + addr->address_length = fixed_memory32->address_length; + addr->maximum = addr->minimum + 
addr->address_length - 1; + return AE_OK; + case ACPI_RESOURCE_TYPE_ADDRESS16: + case ACPI_RESOURCE_TYPE_ADDRESS32: + case ACPI_RESOURCE_TYPE_ADDRESS64: + status = acpi_resource_to_address64(resource, addr); + if (ACPI_SUCCESS(status) && + (addr->resource_type == ACPI_MEMORY_RANGE || + addr->resource_type == ACPI_IO_RANGE) && + addr->address_length > 0) { + return AE_OK; + } + break; } return AE_ERROR; } -- cgit v1.2.2 From 7ce5a2b9bb2e92902230e3121d8c3047fab9cb47 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 23 Apr 2010 16:17:40 -0700 Subject: x86-64: Clear a 64-bit FS/GS base on fork if selector is nonzero When we do a thread switch, we clear the outgoing FS/GS base if the corresponding selector is nonzero. This is taken by __switch_to() as an entry invariant; it does not verify that it is true on entry. However, copy_thread() doesn't enforce this constraint, which can result in inconsistent results after fork(). Make copy_thread() match the behavior of __switch_to(). Reported-and-tested-by: Samuel Thibault Signed-off-by: H. Peter Anvin LKML-Reference: <4BD1E061.8030605@zytor.com> Cc: --- arch/x86/kernel/process_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index dc9690b4c4cc..17cb3295cbf7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -276,12 +276,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, set_tsk_thread_flag(p, TIF_FORK); - p->thread.fs = me->thread.fs; - p->thread.gs = me->thread.gs; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); + p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; savesegment(fs, p->thread.fsindex); + p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); -- cgit v1.2.2 From 7a0fc404ae663776e96db43879a0fa24fec1fa3a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Apr 2010 14:40:54 -0700 Subject: x86: Disable large pages on CPUs with Atom erratum AAE44 Atom erratum AAE44/AAF40/AAG38/AAH41: "If software clears the PS (page size) bit in a present PDE (page directory entry), that will cause linear addresses mapped through this PDE to use 4-KByte pages instead of using a large page after old TLB entries are invalidated. Due to this erratum, if a code fetch uses this PDE before the TLB entry for the large page is invalidated then it may fetch from a different physical address than specified by either the old large page translation or the new 4-KByte page translation. This erratum may also cause speculative code fetches from incorrect addresses." [http://download.intel.com/design/processor/specupdt/319536.pdf] Whereas commit 211b3d03c7400f48a781977a50104c9d12f4e229 seems to work around erratum AAH41 (mixed 4K TLBs), it only reduces the window of opportunity for the bug to occur and does not totally remove it. This patch disables mixed 4K/4MB page tables entirely, avoiding the page splitting and not tripping this processor issue. This is based on an original patch by Colin King. Originally-by: Colin Ian King Cc: Colin Ian King Cc: Ingo Molnar Signed-off-by: H.
Peter Anvin LKML-Reference: <1269271251-19775-1-git-send-email-colin.king@canonical.com> Cc: --- arch/x86/kernel/cpu/intel.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7e1cca13af35..1366c7cfd483 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -47,6 +47,27 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + /* + * Atom erratum AAE44/AAF40/AAG38/AAH41: + * + * A race condition between speculative fetches and invalidating + * a large page. This is worked around in microcode, but we + * need the microcode to have already been loaded... so if it is + * not, recommend a BIOS update and disable large pages. + */ + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) { + u32 ucode, junk; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, junk, ucode); + + if (ucode < 0x20e) { + printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); + } + } + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #else -- cgit v1.2.2 From 453dc65931915abc61f92e12bba1fc4747ff5542 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 23 Apr 2010 13:18:08 -0400 Subject: VMware Balloon driver This is a standalone version of the VMware Balloon driver. Ballooning is a technique that allows the hypervisor to dynamically limit the amount of memory available to the guest (with guest cooperation). In the overcommit scenario, when the hypervisor detects that it needs to shuffle some memory, it instructs the driver to allocate a certain number of pages, and the underlying memory gets returned to the hypervisor. Later, the hypervisor may return memory to the guest by reattaching memory to the pageframes and instructing the driver to "deflate" the balloon. We are submitting a standalone driver because the KVM maintainer (Avi Kivity) expressed the opinion (rightly) that our transport does not fit well into the virtqueue paradigm and thus it does not make much sense to integrate with virtio. There were also some concerns about whether the current ballooning technique is the right thing. If a better framework to achieve this appears, we are prepared to evaluate it and switch, but in the meantime we'd like to get this driver upstream. We want to get the driver accepted in distributions so that users do not have to deal with an out-of-tree module, and many distributions have an "upstream first" requirement. The driver has been shipping for a number of years, and users running on the VMware platform will have it installed as part of VMware Tools even if it does not come from a distribution; thus there should not be additional risk in pulling the driver into mainline. The driver will only activate if the host is VMware, so everyone else should not be affected at all.
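As a rough sketch of the mechanism described above (hypothetical code, not the driver itself; tell_hypervisor_lock_page() stands in for VMware's actual transport): inflating the balloon means allocating guest pages and reporting their frame numbers to the hypervisor, which can then reuse the backing host memory.

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/mm.h>

static LIST_HEAD(balloon_page_list);

/* Hypothetical stand-in for the hypervisor transport. */
extern int tell_hypervisor_lock_page(unsigned long pfn);

static int balloon_inflate_one(void)
{
	struct page *page;

	/* __GFP_NORETRY: back off under pressure rather than OOM-killing. */
	page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY | __GFP_NOWARN);
	if (!page)
		return -ENOMEM;

	if (tell_hypervisor_lock_page(page_to_pfn(page))) {
		__free_page(page);
		return -EIO;
	}

	/* The page stays allocated, and unused by the guest, until the
	 * hypervisor asks the driver to deflate. */
	list_add(&page->lru, &balloon_page_list);
	return 0;
}

Deflation is the reverse: pop pages off the list, notify the hypervisor, and hand them back to the guest page allocator with __free_page().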
Signed-off-by: Dmitry Torokhov Cc: Avi Kivity Cc: Jeremy Fitzhardinge Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/vmware.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 1cbed97b59cf..dfdb4dba2320 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -22,6 +22,7 @@ */ #include +#include #include #include #include @@ -101,6 +102,7 @@ int vmware_platform(void) return 0; } +EXPORT_SYMBOL(vmware_platform); /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. -- cgit v1.2.2 From 55051feb57eba600b366006757304a0af3ada2bd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 23 Apr 2010 17:05:24 -0600 Subject: x86/PCI: never allocate PCI MMIO resources below BIOS_END When we move a PCI device or assign resources to a device not configured by the BIOS, we want to avoid the BIOS region below 1MB. Note that if the BIOS places devices below 1MB, we leave them there. See https://bugzilla.kernel.org/show_bug.cgi?id=15744 and https://bugzilla.kernel.org/show_bug.cgi?id=15841 Tested-by: Andy Isaacson Tested-by: Andy Bailey Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 46fd43f79103..97da2ba9344b 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -72,6 +72,9 @@ pcibios_align_resource(void *data, const struct resource *res, return start; if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; + } else if (res->flags & IORESOURCE_MEM) { + if (start < BIOS_END) + start = BIOS_END; } return start; } -- cgit v1.2.2 From 48728e077480910df45baabc5f87b04276348c90 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Apr 2010 14:45:43 -0600 Subject: x86/PCI: compute Address Space length rather than using _LEN ACPI _CRS Address Space Descriptors have _MIN, _MAX, and _LEN. Linux has been computing Address Spaces as [_MIN to _MIN + _LEN - 1]. Based on the tests in the bug reports below, Windows apparently uses [_MIN to _MAX]. Per spec (ACPI 4.0, Table 6-40), for _CRS fixed-size, fixed location descriptors, "_LEN must be (_MAX - _MIN + 1)", and when that's true, it doesn't matter which way we compute the end. But of course, there are BIOSes that don't follow this rule, and we're better off if Linux handles those exceptions the same way as Windows. This patch makes Linux use [_MIN to _MAX], as Windows seems to do. This effectively reverts d558b483d5 and 03db42adfe and replaces them with simpler code. https://bugzilla.kernel.org/show_bug.cgi?id=14337 (round) https://bugzilla.kernel.org/show_bug.cgi?id=15480 (truncate) Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 44f83ce02470..31930fd30ea9 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -121,30 +121,6 @@ count_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static void -align_resource(struct acpi_device *bridge, struct resource *res) -{ - int align = (res->flags & IORESOURCE_MEM) ? 
16 : 4; - - /* - * Host bridge windows are not BARs, but the decoders on the PCI side - * that claim this address space have starting alignment and length - * constraints, so fix any obvious BIOS goofs. - */ - if (!IS_ALIGNED(res->start, align)) { - dev_printk(KERN_DEBUG, &bridge->dev, - "host bridge window %pR invalid; " - "aligning start to %d-byte boundary\n", res, align); - res->start &= ~(align - 1); - } - if (!IS_ALIGNED(res->end + 1, align)) { - dev_printk(KERN_DEBUG, &bridge->dev, - "host bridge window %pR invalid; " - "aligning end to %d-byte boundary\n", res, align); - res->end = ALIGN(res->end, align) - 1; - } -} - static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) { @@ -154,7 +130,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) acpi_status status; unsigned long flags; struct resource *root, *conflict; - u64 start, end, max_len; + u64 start, end; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -171,19 +147,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; - max_len = addr.maximum - addr.minimum + 1; - if (addr.address_length > max_len) { - dev_printk(KERN_DEBUG, &info->bridge->dev, - "host bridge window length %#llx doesn't fit in " - "%#llx-%#llx, trimming\n", - (unsigned long long) addr.address_length, - (unsigned long long) addr.minimum, - (unsigned long long) addr.maximum); - addr.address_length = max_len; - } - start = addr.minimum + addr.translation_offset; - end = start + addr.address_length - 1; + end = addr.maximum + addr.translation_offset; res = &info->res[info->res_num]; res->name = info->name; @@ -191,7 +156,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->start = start; res->end = end; res->child = NULL; - align_resource(info->bridge, res); if (!pci_use_crs) { dev_printk(KERN_DEBUG, &info->bridge->dev, -- cgit v1.2.2 From e67a807f3d9a82fa91817871f1c0e2e04da993b8 Mon Sep 17 00:00:00 2001 From: Liang Li Date: Fri, 30 Apr 2010 18:01:51 +0800 Subject: x86: Fix 'reservetop=' functionality When specifying the 'reservetop=0xbadc0de' kernel parameter, the kernel will stop booting due to an early_ioremap bug that relates to commit 8827247ff. The root cause of the boot failure is that the value of 'slot_virt[i]' was initialized in setup_arch->early_ioremap_init(), but later in setup_arch the function 'parse_early_param' will modify 'FIXADDR_TOP' when 'reservetop=0xbadc0de' is specified. The simplest fix might be to use __fix_to_virt(idx0) to get the updated value of 'FIXADDR_TOP' in '__early_ioremap' instead of referencing the old value from slot_virt[slot] directly. Changelog since v0: -v1: When reservetop is handled, FIXADDR_TOP gets adjusted; hence check prev_map, then re-initialize slot_virt and the PMD based on the new FIXADDR_TOP. -v2: place fixup_early_ioremap, and hence the call to early_ioremap_init, in reserve_top_address to re-initialize slot_virt and the corresponding PMD when parse_reservetop runs -v3: move fixup_early_ioremap out of reserve_top_address to make sure other clients of reserve_top_address like xen/lguest won't break Signed-off-by: Liang Li Tested-by: Konrad Rzeszutek Wilk Acked-by: Yinghai Lu Acked-by: Jeremy Fitzhardinge Cc: Wang Chen Cc: "H.
Peter Anvin" Cc: Andrew Morton LKML-Reference: <1272621711-8683-1-git-send-email-liang.li@windriver.com> [ fixed three small cleanliness details in fixup_early_ioremap() ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 1 + arch/x86/mm/ioremap.c | 14 ++++++++++++++ arch/x86/mm/pgtable_32.c | 1 + 3 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index a1dcfa3ab17d..30a3e9776123 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -347,6 +347,7 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr, extern void __iomem *early_memremap(resource_size_t phys_addr, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); +extern void fixup_early_ioremap(void); #define IO_SPACE_LIMIT 0xffff diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 5eb1ba74a3a9..12e4d2d3c110 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -448,6 +448,20 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; +void __init fixup_early_ioremap(void) +{ + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i]) { + WARN_ON(1); + break; + } + } + + early_ioremap_init(); +} + static int __init check_early_ioremap_leak(void) { int count = 0; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 1a8faf09afed..26eadaa60e6c 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -128,6 +128,7 @@ static int __init parse_reservetop(char *arg) address = memparse(arg, &arg); reserve_top_address(address); + fixup_early_ioremap(); return 0; } early_param("reservetop", parse_reservetop); -- cgit v1.2.2 From bbd391a15d82e14efe9d69ba64cadb855b061dba Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 27 Apr 2010 11:24:42 -0400 Subject: x86: Fix NULL pointer access in irq_force_complete_move() for Xen guests Upstream PV guests fail to boot because of a NULL pointer in irq_force_complete_move(). It is possible that xen guests have irq_desc->chip_data = NULL. Test for NULL chip_data pointer before attempting to complete an irq move. Signed-off-by: Prarit Bhargava LKML-Reference: <20100427152434.16193.49104.sendpatchset@prarit.bos.redhat.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin Cc: [2.6.33] --- arch/x86/kernel/apic/io_apic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 127b8718abfb..eb2789c3f721 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2545,6 +2545,9 @@ void irq_force_complete_move(int irq) struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg = desc->chip_data; + if (!cfg) + return; + __irq_complete_move(&desc, cfg->vector); } #else -- cgit v1.2.2 From 56f0e74c9cf98941af700b61466648a2d06277bb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 May 2010 09:19:43 +0200 Subject: x86: Fix parse_reservetop() build failure on certain configs Commit e67a807 ("x86: Fix 'reservetop=' functionality") added a fixup_early_ioremap() call to parse_reservetop() and declared it in io.h. But asm/io.h was only included indirectly - and on some configs not at all, causing a build failure on those configs. Cc: Liang Li Cc: Konrad Rzeszutek Wilk Cc: Yinghai Lu Cc: Jeremy Fitzhardinge Cc: Wang Chen Cc: "H. 
Peter Anvin" Cc: Andrew Morton LKML-Reference: <1272621711-8683-1-git-send-email-liang.li@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 26eadaa60e6c..792854003ed3 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -18,6 +18,7 @@ #include #include #include +#include unsigned int __VMALLOC_RESERVE = 128 << 20; -- cgit v1.2.2 From b810e94c9d8e3fff6741b66cd5a6f099a7887871 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Wed, 31 Mar 2010 21:56:45 +0200 Subject: powernow-k8: Fix frequency reporting With F10, model 10, all valid frequencies are in the ACPI _PST table. Cc: # 33.x 32.x Signed-off-by: Mark Langsdorf LKML-Reference: <1270065406-1814-6-git-send-email-bp@amd64.org> Signed-off-by: Borislav Petkov Reviewed-by: Thomas Renninger Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index d360b56e9825..b6215b9798e2 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -929,7 +929,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, powernow_table[i].index = index; /* Frequency may be rounded for these */ - if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { + if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) + || boot_cpu_data.x86 == 0x11) { powernow_table[i].frequency = freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); } else -- cgit v1.2.2 From a66f6375bdeb64d7a56c532bda7c006358845820 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 4 May 2010 13:42:53 +0100 Subject: Fix the x86_64 implementation of call_rwsem_wait() The x86_64 call_rwsem_wait() treats the active state counter part of the R/W semaphore state as being 16-bit when it's actually 32-bit (it's half of the 64-bit state). It should do "decl %edx" not "decw %dx". Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- arch/x86/lib/rwsem_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S index 15acecf0d7aa..41fcf00e49df 100644 --- a/arch/x86/lib/rwsem_64.S +++ b/arch/x86/lib/rwsem_64.S @@ -60,7 +60,7 @@ ENTRY(call_rwsem_down_write_failed) ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) - decw %dx /* do nothing if still outstanding active readers */ + decl %edx /* do nothing if still outstanding active readers */ jnz 1f save_common_regs movq %rax,%rdi -- cgit v1.2.2 From b0c4d952a158a6a2547672cf4fc9d55e415410de Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 6 May 2010 02:24:34 -0700 Subject: x86: Fix fake apicid to node mapping for numa emulation With NUMA emulation, it's possible for a single cpu to be bound to multiple nodes since more than one may have affinity if allocated on a physical node that is local to the cpu. APIC ids must therefore be mapped to the lowest node ids to maintain generic kernel use of functions such as cpu_to_node() that determine device affinity. 
For example, if a device has proximity to physical node 1 and a cpu happens to be mapped to a higher emulated node id 8, the proximity may not be correctly determined by comparison in generic code even though the cpu may be truly local and allocated on physical node 1. When this happens, the true topology of the machine isn't accurately represented in the emulated environment; although this isn't critical to the system's uptime, any generic code that is NUMA aware benefits from the physical topology being accurately represented. This can affect any system that maps multiple APIC ids to a single node and is booted with numa=fake=N where N is greater than the number of physical nodes. Signed-off-by: David Rientjes Cc: Yinghai Lu Cc: Suresh Siddha LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 28c68762648f..38512d0c4742 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -461,7 +461,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) * node, it must now point to the fake node ID. */ for (j = 0; j < MAX_LOCAL_APIC; j++) - if (apicid_to_node[j] == nid) + if (apicid_to_node[j] == nid && + fake_apicid_to_node[j] == NUMA_NO_NODE) fake_apicid_to_node[j] = i; } for (i = 0; i < num_nodes; i++) -- cgit v1.2.2 From 829e92458532b1dbfeb972435d45bb060cdbf5a3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Apr 2010 18:33:49 -0400 Subject: kprobes/x86: Fix removed int3 checking order Fix kprobes/x86 to check for a removed int3 when it fails to get a kprobe from the hlist. Since there is a time window between checking that the int3 exists at the probed address and getting the kprobe for that address, we can have the following scenario:

-------
CPU1                    CPU2
hit int3
check int3 exists
                        remove int3
                        remove kprobe from hlist
get kprobe from hlist
no kprobe->OOPS!
-------

This patch fixes the problem by moving the int3 check to the case where no kprobe is found for that address, as follows:

------
CPU1                    CPU2
hit int3
                        remove int3
                        remove kprobe from hlist
get kprobe from hlist
no kprobe->check int3 exists
          ->rollback&retry
------

Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: systemtap Cc: DLE Cc: Dave Anderson Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20100427223348.2322.9112.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b43bbaebe2c0..1658efdfb4e5 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -534,20 +534,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) struct kprobe_ctlblk *kcb; addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->ip = (unsigned long)addr; - return 1; - } - /* * We don't want to be preempted for the entire * duration of kprobe processing.
We conditionally @@ -579,6 +565,19 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) setup_singlestep(p, regs, kcb, 0); return 1; } + } else if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->ip = (unsigned long)addr; + preempt_enable_no_resched(); + return 1; } else if (kprobe_running()) { p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { -- cgit v1.2.2 From 061e2fd16863009c8005b4b5fdfb75c7215c0b99 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 5 May 2010 16:04:43 +0200 Subject: KVM: SVM: Fix wrong intercept masks on 32 bit This patch makes KVM on 32-bit SVM work again by correcting the masks used for iret interception. With the wrong masks, the upper 32 bits of the intercepts are masked out, which leaves vmrun unintercepted. This is not legal on SVM and the vmrun fails. The bug was introduced by commits 95ba827313 and 3cfc3092. Cc: Jan Kiszka Cc: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2ba58206812a..737361fcd503 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2067,7 +2067,7 @@ static int cpuid_interception(struct vcpu_svm *svm) static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; - svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; return 1; } @@ -2479,7 +2479,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -2539,10 +2539,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) if (masked) { svm->vcpu.arch.hflags |= HF_NMI_MASK; - svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); } else { svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); } } -- cgit v1.2.2 From fe19c5a46b4c519153fddd4d5efe32a3e4cfa694 Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:21:33 +0800 Subject: KVM: x86: Call vcpu_load and vcpu_put in cpuid_update cpuid_update may operate on the VMCS, so vcpu_load() and vcpu_put() should be called to ensure correctness.
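The convention being enforced, in sketch form (a minimal illustration, not the patched functions; do_vmcs_work() is a hypothetical helper): any vcpu ioctl path that may touch per-vCPU hardware state such as the VMCS must run between vcpu_load() and vcpu_put(), which bind the vCPU to the current physical CPU and release it again.

#include <linux/kvm_host.h>

extern int do_vmcs_work(struct kvm_vcpu *vcpu);	/* hypothetical helper */

static int kvm_vcpu_ioctl_example(struct kvm_vcpu *vcpu)
{
	int r;

	vcpu_load(vcpu);	/* make this vCPU's state (e.g. its VMCS) current */
	r = do_vmcs_work(vcpu);
	vcpu_put(vcpu);		/* unbind; the vCPU may migrate afterwards */

	return r;
}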
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c4ca98ad27f..c4f35b545c1d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1712,6 +1712,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; + vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1729,6 +1730,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); + vcpu_put(vcpu); out_free: vfree(cpuid_entries); @@ -1749,9 +1751,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; + vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); + vcpu_put(vcpu); return 0; out: -- cgit v1.2.2 From f8c5fae16649445e15656667f72bd51d777f7766 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 11 May 2010 15:16:46 +0200 Subject: KVM: VMX: blocked-by-sti must not defer NMI injections As the processor may not consider GUEST_INTR_STATE_STI as a reason for blocking NMI, it could return immediately with EXIT_REASON_NMI_WINDOW when we asked for it. But as we consider this state as NMI-blocking, we can run into an endless loop. Resolve this by allowing NMI injection if just GUEST_INTR_STATE_STI is active (originally suggested by Gleb). Intel confirmed that this is safe: the processor will never complain about NMI injection in this state. Signed-off-by: Jan Kiszka KVM-Stable-Tag Acked-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bc933cfb4e66..2f8db0ec8ae4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2703,8 +2703,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) return 0; return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_NMI)); + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); } static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From f01487119dda3d9f58c9729c7361ecc50a61c188 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 27 Apr 2010 12:13:48 +0200 Subject: x86, amd: Check X86_FEATURE_OSVW bit before accessing OSVW MSRs If the host CPU is exposed to a guest, the OSVW MSRs are not guaranteed to be present and a GP fault occurs. Thus checking the feature flag is essential. Cc: # .32.x .33.x Signed-off-by: Andreas Herrmann LKML-Reference: <20100427101348.GC4489@alberich.amd.com> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/process.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 28ad9f4d8b94..0415c3ef91b5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) * check OSVW bit for CPUs that are not affected * by erratum #400 */ - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); - if (val >= 2) { - rdmsrl(MSR_AMD64_OSVW_STATUS, val); - if (!(val & BIT(1))) - goto no_c1e_idle; + if (cpu_has(c, X86_FEATURE_OSVW)) { + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); + if (val >= 2) { + rdmsrl(MSR_AMD64_OSVW_STATUS, val); + if (!(val & BIT(1))) + goto no_c1e_idle; + } } return 1; } -- cgit v1.2.2 From ade029e2aaacc8965a548b0b0f80c5bee97ffc68 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 24 Apr 2010 09:56:53 +0200 Subject: x86, k8: Fix build error when K8_NB is disabled K8_NB depends on PCI, and when the latter is disabled (allnoconfig) we fail at the final linking stage due to the missing exported num_k8_northbridges. Add a header stub for that. Signed-off-by: Borislav Petkov LKML-Reference: <20100503183036.GJ26107@aftab> Signed-off-by: H. Peter Anvin Cc: --- arch/x86/include/asm/k8.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index f70e60071fe8..af00bd1d2089 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -16,11 +16,16 @@ extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); extern int k8_scan_nodes(void); #ifdef CONFIG_K8_NB +extern int num_k8_northbridges; + static inline struct pci_dev *node_to_k8_nb_misc(int node) { return (node < num_k8_northbridges) ?
k8_northbridges[node] : NULL; } + #else +#define num_k8_northbridges 0 + static inline struct pci_dev *node_to_k8_nb_misc(int node) { return NULL; -- cgit v1.2.2 From 7f284d3cc96e02468a42e045f77af11e5ff8b095 Mon Sep 17 00:00:00 2001 From: Frank Arnold Date: Thu, 22 Apr 2010 16:06:59 +0200 Subject: x86, cacheinfo: Turn off L3 cache index disable feature in virtualized environments When running a guest kernel on Xen we get: BUG: unable to handle kernel NULL pointer dereference at 0000000000000038 IP: [] cpuid4_cache_lookup_regs+0x2ca/0x3df PGD 0 Oops: 0000 [#1] SMP last sysfs file: CPU 0 Modules linked in: Pid: 0, comm: swapper Tainted: G W 2.6.34-rc3 #1 /HVM domU RIP: 0010:[] [] cpuid4_cache_lookup_regs+0x2ca/0x3df RSP: 0018:ffff880002203e08 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060 RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000 RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38 R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0 R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68 FS: 0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020) Stack: ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30 <0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70 <0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b Call Trace: [] ? sync_supers_timer_fn+0x0/0x1c [] ? mod_timer+0x23/0x25 [] ? arm_supers_timer+0x34/0x36 [] ? hrtimer_get_next_event+0xa7/0xc3 [] ? get_next_timer_interrupt+0x19a/0x20d [] get_cpu_leaves+0x5c/0x232 [] ? sched_clock_local+0x1c/0x82 [] ? sched_clock_tick+0x75/0x7a [] generic_smp_call_function_single_interrupt+0xae/0xd0 [] smp_call_function_single_interrupt+0x18/0x27 [] call_function_single_interrupt+0x13/0x20 [] ? notifier_call_chain+0x14/0x63 [] ? native_safe_halt+0xc/0xd [] ? default_idle+0x36/0x53 [] cpu_idle+0xaa/0xe4 [] rest_init+0x7e/0x80 [] start_kernel+0x40e/0x419 [] x86_64_start_reservations+0xb3/0xb7 [] x86_64_start_kernel+0xf8/0x107 Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb RIP [] cpuid4_cache_lookup_regs+0x2ca/0x3df RSP CR2: 0000000000000038 ---[ end trace a7919e7f17c0a726 ]--- The L3 cache index disable feature of AMD CPUs has to be disabled if the kernel is running as a guest on top of a hypervisor because northbridge devices are not available to the guest. Currently, this fixes a boot crash on top of Xen. In the future this will become an issue on KVM as well. Check if northbridge devices are present and do not enable the feature if there are none. [ hpa: backported to 2.6.34 ] Signed-off-by: Frank Arnold LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org> Acked-by: Borislav Petkov Signed-off-by: H.
Peter Anvin Cc: --- arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index b3eeb66c0a51..95962a93f99a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -340,6 +340,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) (boot_cpu_data.x86_mask < 0x1))) return; + /* not in virtualized environments */ + if (num_k8_northbridges == 0) + return; + this_leaf->can_disable = true; this_leaf->l3_indices = amd_calc_l3_indices(); } -- cgit v1.2.2 From e9b1d5d0ff4d3ae86050dc4c91b3147361c7af9e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 14 May 2010 13:55:57 -0700 Subject: x86, mrst: Don't blindly access extended config space Do not blindly access extended configuration space unless we actively know we're on a Moorestown platform. The fixed-size BAR capability lives in the extended configuration space, and thus is not applicable if the configuration space isn't appropriately sized. This fixes booting certain VMware configurations with CONFIG_MRST=y. Moorestown will add a fake PCI-X 266 capability to advertise the presence of extended configuration space. Reported-and-tested-by: Petr Vandrovec Signed-off-by: H. Peter Anvin Acked-by: Jacob Pan Acked-by: Jesse Barnes LKML-Reference: --- arch/x86/pci/mrst.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index 8bf2fcb88d04..1cdc02cf8fa4 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -247,6 +247,10 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev) u32 size; int i; + /* Must have extended configuration space */ + if (dev->cfg_size < PCIE_CAP_OFFSET + 4) + return; + /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */ offset = fixed_bar_cap(dev->bus, dev->devfn); if (!offset || PCI_DEVFN(2, 0) == dev->devfn || -- cgit v1.2.2
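A closing note on the pattern used in the fix above: PCIe extended configuration space occupies offsets 0x100 through 0xfff, so any capability lookup there must first confirm the device really exposes a 4096-byte config space. A minimal sketch of such a guard (the helper is hypothetical; cfg_size is the same field the patch tests):

#include <linux/pci.h>

/* Read a dword from extended config space only if the device has one;
 * cfg_size is 4096 for PCI Express devices but 256 for conventional PCI. */
static int read_ext_cfg_dword(struct pci_dev *dev, int offset, u32 *val)
{
	if (offset + 4 > dev->cfg_size)
		return -ENODEV;	/* no extended configuration space */

	return pci_read_config_dword(dev, offset, val);
}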