From dd17c8f72993f9461e9c19250e3f155d6d99df22 Mon Sep 17 00:00:00 2001 From: Rusty Russell <rusty@rustcorp.com.au> Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: remove per_cpu__ prefix. Now that the return from alloc_percpu is compatible with the address of per-cpu vars, it makes sense to hand around the address of per-cpu variables. To make this sane, we remove the per_cpu__ prefix we used created to stop people accidentally using these vars directly. Now we have sparse, we can use that (next patch). tj: * Updated to convert stuff which were missed by or added after the original patch. * Kill per_cpu_var() macro. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Christoph Lameter <cl@linux-foundation.org> --- arch/x86/kernel/apic/nmi.c | 6 +++--- arch/x86/kernel/head_32.S | 6 +++--- arch/x86/kernel/vmlinux.lds.S | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index e631cc4416f7..45404379d173 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -437,8 +437,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - __this_cpu_inc(per_cpu_var(alert_counter)); - if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) + __this_cpu_inc(alert_counter); + if (__this_cpu_read(alert_counter) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ @@ -446,7 +446,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(per_cpu_var(alert_counter), 0); + __this_cpu_write(alert_counter, 0); } /* see if the nmi watchdog went off */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 050c278481b1..fd39eaf83b84 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -438,8 +438,8 @@ is386: movl $2,%ecx # set MP */ cmpb $0,ready jne 1f - movl $per_cpu__gdt_page,%eax - movl $per_cpu__stack_canary,%ecx + movl $gdt_page,%eax + movl $stack_canary,%ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -702,7 +702,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ + .long gdt_page /* Overwritten for secondary CPUs */ /* * The boot_gdt must mirror the equivalent in setup.S and is diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9fa..ecb92717c412 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -312,7 +312,7 @@ SECTIONS * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); @@ -323,7 +323,7 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((per_cpu__irq_stack_union == 0), +. = ASSERT((irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif -- cgit v1.2.2 From b76365a18f7593c9df32a74bf2b4a39741b67bc6 Mon Sep 17 00:00:00 2001 From: Russ Anderson <rja@sgi.com> Date: Thu, 17 Dec 2009 10:53:25 -0600 Subject: x86, uv: Add serial number parameter to uv_bios_get_sn_info() Add system_serial_number to the information returned by uv_bios_get_sn_info() UV BIOS call. Signed-off-by: Russ Anderson <rja@sgi.com> LKML-Reference: <20091217165323.GA30774@sgi.com> Cc: Jack Steiner <steiner@sgi.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/x2apic_uv_x.c | 6 +++--- arch/x86/kernel/bios_uv.c | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b684bb303cbf..af5d103bb533 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -627,8 +627,8 @@ void __init uv_system_init(void) } uv_bios_init(); - uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &sn_coherency_id, &sn_region_size); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, + &sn_region_size, &system_serial_number); uv_rtc_init(); for_each_present_cpu(cpu) { diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index b0206a211b09..c918ebab52ab 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -15,8 +15,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson <rja@sgi.com> */ #include <linux/efi.h> @@ -30,6 +30,7 @@ static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { struct uv_systab *tab = &uv_systab; + s64 ret; if (!tab->function) /* @@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - return efi_call6((void *)__va(tab->function), - (u64)which, a1, a2, a3, a4, a5); + ret = efi_call6((void *)__va(tab->function), (u64)which, + a1, a2, a3, a4, a5); + return ret; } +EXPORT_SYMBOL_GPL(uv_bios_call); s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) @@ -73,11 +76,14 @@ long sn_coherency_id; EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); +long system_serial_number; +EXPORT_SYMBOL_GPL(system_serial_number); int uv_type; +EXPORT_SYMBOL_GPL(uv_type); s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region) + long *region, long *ssn) { s64 ret; u64 v0, v1; @@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, *coher = part.coherence_id; if (region) *region = part.region_size; + if (ssn) + *ssn = v1; return ret; } +EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); int uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, @@ -185,4 +194,3 @@ void uv_bios_init(void) void uv_bios_init(void) { } #endif - -- cgit v1.2.2 From 0f764806438d5576ac58898332e5dcf30bb8a679 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Mon, 21 Dec 2009 15:51:23 +0100 Subject: x86/amd-iommu: Fix initialization failure panic The assumption that acpi_table_parse passes the return value of the hanlder function to the caller proved wrong recently. The return value of the handler function is totally ignored. This makes the initialization code for AMD IOMMU buggy in a way that could cause a kernel panic on initialization. This patch fixes the issue in the AMD IOMMU driver. Cc: stable@kernel.org Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu_init.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 1dca9c34eaeb..fb490ce7dd55 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -137,6 +137,11 @@ int amd_iommus_present; /* IOMMUs have a non-present cache? */ bool amd_iommu_np_cache __read_mostly; +/* + * Set to true if ACPI table parsing and hardware intialization went properly + */ +static bool amd_iommu_initialized; + /* * List of protection domains - used during resume */ @@ -929,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table) } WARN_ON(p != end); + amd_iommu_initialized = true; + return 0; } @@ -1263,6 +1270,9 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; + if (!amd_iommu_initialized) + goto free; + if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; -- cgit v1.2.2 From fd2a50a0240f5f5b59070474eabd83a85720a406 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Date: Thu, 24 Dec 2009 01:54:47 +0000 Subject: x86, perfctr: Remove unused func avail_to_resrv_perfctr_nmi() avail_to_resrv_perfctr_nmi() is neither EXPORT'd, nor used in the file. So remove it. Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Acked-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: oprofile-list@lists.sf.net LKML-Reference: <20091224015441.6005.4408.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perfctr-watchdog.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 898df9719afb..74f4e85a5727 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) return !test_bit(counter, perfctr_nmi_owner); } - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); int reserve_perfctr_nmi(unsigned int msr) -- cgit v1.2.2 From d015a092989d673df44a5ad6866dc5d5006b7a2a Mon Sep 17 00:00:00 2001 From: Pekka Enberg <penberg@cs.helsinki.fi> Date: Mon, 28 Dec 2009 10:26:59 +0200 Subject: x86: Use KERN_DEFAULT log-level in __show_regs() Andrew Morton reported a strange looking kmemcheck warning: WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88004fba6c20) 0000000000000000310000000000000000000000000000002413000000c9ffff u u u u u u u u u u u u u u u u i i i i i i i i u u u u u u u u [<ffffffff810af3aa>] kmemleak_scan+0x25a/0x540 [<ffffffff810afbcb>] kmemleak_scan_thread+0x5b/0xe0 [<ffffffff8104d0fe>] kthread+0x9e/0xb0 [<ffffffff81003074>] kernel_thread_helper+0x4/0x10 [<ffffffffffffffff>] 0xffffffffffffffff The above printout is missing register dump completely. The problem here is that the output comes from syslog which doesn't show KERN_INFO log-level messages. We didn't see this before because both of us were testing on 32-bit kernels which use the _default_ log-level. Fix that up by explicitly using KERN_DEFAULT log-level for __show_regs() printks. Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Vegard Nossum <vegard.nossum@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Arjan van de Ven <arjan@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> LKML-Reference: <1261988819.4641.2.camel@penberg-laptop> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/process_32.c | 14 +++++++------- arch/x86/kernel/process_64.c | 24 ++++++++++++------------ 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 98c2cdeb599e..c6ee241c8a98 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -103,8 +103,8 @@ void show_regs_common(void) if (!product) product = ""; - printk("\n"); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9c517b5858f0..37ad1e046aae 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -139,16 +139,16 @@ void __show_regs(struct pt_regs *regs, int all) show_regs_common(); - printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); print_symbol("EIP is at %s\n", regs->ip); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); if (!all) @@ -158,19 +158,19 @@ void __show_regs(struct pt_regs *regs, int all) cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); - printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", d6, d7); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 52fbd0c60198..f9e033150cdf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,19 +161,19 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int ds, cs, es; show_regs_common(); - printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); - printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); - printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); @@ -194,21 +194,21 @@ void __show_regs(struct pt_regs *regs, int all) cr3 = read_cr3(); cr4 = read_cr4(); - printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) -- cgit v1.2.2 From 39d30770992895d55789de64bad2349510af68d0 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 28 Dec 2009 13:28:25 -0800 Subject: x86: SGI UV: Fix writes to led registers on remote uv hubs The wrong address was being used to write the SCIR led regs on remote hubs. Also, there was an inconsistency between how BIOS and the kernel indexed these regs. Standardize on using the lower 6 bits of the APIC ID as the index. This patch fixes the problem of writing to an errant address to a cpu # >= 64. Signed-off-by: Mike Travis <travis@sgi.com> Reviewed-by: Jack Steiner <steiner@sgi.com> Cc: Robin Holt <holt@sgi.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: stable@kernel.org LKML-Reference: <4B3922F9.3060905@sgi.com> [ v2: fix a number of annoying checkpatch artifacts and whitespace noise ] Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/x2apic_uv_x.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d56b0efb2057..5f92494dab61 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -629,8 +629,10 @@ void __init uv_system_init(void) uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -651,15 +653,13 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", + cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ -- cgit v1.2.2 From 9959c888a38b0f25b0e81a480f537d6489348442 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Mon, 28 Dec 2009 21:08:29 -0800 Subject: x86: Increase NR_IRQS and nr_irqs I have a system with lots of igb and ixgbe, when iov/vf are enabled for them, we hit the limit of 3064. when system has 20 pcie installed, and one card has 2 functions, and one function needs 64 msi-x, may need 20 * 2 * 64 = 2560 for msi-x but if iov and vf are enabled may need 20 * 2 * 64 * 3 = 7680 for msi-x assume system with 5 ioapic, nr_irqs_gsi will be 120. NR_CPUS = 512, and nr_cpu_ids = 128 will have NR_IRQS = 256 + 512 * 64 = 33024 will have nr_irqs = 120 + 8 * 128 + 120 * 64 = 8824 When SPARSE_IRQ is not set, there is no increase with kernel data size. when NR_CPUS=128, and SPARSE_IRQ is set: text data bss dec hex filename 21837444 4216564 12480736 38534744 24bfe58 vmlinux.before 21837442 4216580 12480736 38534758 24bfe66 vmlinux.after when NR_CPUS=4096, and SPARSE_IRQ is set text data bss dec hex filename 21878619 5610244 13415392 40904255 270263f vmlinux.before 21878617 5610244 13415392 40904253 270263d vmlinux.after Signed-off-by: Yinghai Lu <yinghai@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> LKML-Reference: <4B398ECD.1080506@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index de00c4619a55..d9cd1f1b9c07 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3840,7 +3840,7 @@ int __init arch_probe_nr_irqs(void) /* * for MSI and HT dyn irq */ - nr += nr_irqs_gsi * 16; + nr += nr_irqs_gsi * 64; #endif if (nr < nr_irqs) nr_irqs = nr; -- cgit v1.2.2 From 9a7262a0563da6b91019156abf487bcdf1a41526 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 28 Dec 2009 13:28:25 -0800 Subject: x86_64 SGI UV: Fix writes to led registers on remote uv hubs. The wrong address was being used to write the SCIR led regs on remote hubs. Also, there was an inconsistency between how BIOS and the kernel indexed these regs. Standardize on using the lower 6 bits of the APIC ID as the index. This patch fixes the problem of writing to an errant address to a cpu # >= 64. Signed-off-by: Mike Travis <travis@sgi.com> Reviewed-by: Jack Steiner <steiner@sgi.com> Cc: stable@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/apic/x2apic_uv_x.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d56b0efb2057..5f92494dab61 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -629,8 +629,10 @@ void __init uv_system_init(void) uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -651,15 +653,13 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", + cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ -- cgit v1.2.2 From d7f0eea9e431e1b8b0742a74db1a9490730b2a25 Mon Sep 17 00:00:00 2001 From: Zhang Rui <rui.zhang@intel.com> Date: Wed, 30 Dec 2009 15:36:42 +0800 Subject: ACPI: introduce kernel parameter acpi_sleep=sci_force_enable Introduce kernel parameter acpi_sleep=sci_force_enable some laptop requires SCI_EN being set directly on resume, or else they hung somewhere in the resume code path. We already have a blacklist for these laptops but we still need this option, especially when debugging some suspend/resume problems, in case there are systems that need this workaround and are not yet in the blacklist. Signed-off-by: Zhang Rui <rui.zhang@intel.com> Acked-by: Rafael J. Wysocki <rjw@sisk.pl> Signed-off-by: Len Brown <len.brown@intel.com> --- arch/x86/kernel/acpi/sleep.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b91..f9961034e557 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "sci_force_enable", 16) == 0) + acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); -- cgit v1.2.2 From 48b5ba9cc98d676712da29d9931f1c88e5185ff2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Thu, 31 Dec 2009 05:53:02 +0100 Subject: perf: Pass appropriate frame pointer to dump_trace() Pass the frame pointer from the regs of the interrupted path to dump_trace() while processing the stack trace. Currently, dump_trace() takes the current bp and starts the callchain from dump_trace() itself. This is wasteful because we need to walk through the entire NMI/DEBUG stack before retrieving the interrupted point. We can fix that by just using the frame pointer from the captured regs. It points exactly where we want to start. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Paul Mackerras <paulus@samba.org> LKML-Reference: <1262235183-5320-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Paul Mackerras <paulus@samba.org> --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c223b7e895d9..d616c06e99b4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2347,7 +2347,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } /* -- cgit v1.2.2 From 9dad0fd5a73d4048dff18069733c0b515f68df74 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Tue, 22 Dec 2009 15:40:39 -0800 Subject: x86: Fix size for ex trampoline with 32bit fix for error that is introduced by | x86: Use find_e820() instead of hard coded trampoline address it should end with PAGE_SIZE + PAGE_SIZE Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1261525263-13763-2-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 05ed7ab2ca48..a1a7876cadcb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -733,13 +733,13 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 }, + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, #endif {} -- cgit v1.2.2 From ea94396629a3e0cb9a3a9c75335b1de255b30426 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Mon, 4 Jan 2010 21:14:41 -0800 Subject: x86, apic: Don't waste a vector to improve vector spread We want to use a vector-assignment sequence that avoids stumbling onto 0x80 earlier in the sequence, in order to improve the spread of vectors across priority levels on machines with a small number of interrupt sources. Right now, this is done by simply making the first vector (0x31 or 0x41) completely unusable. This is unnecessary; all we need is to start assignment at a +1 offset, we don't actually need to prohibit the usage of this vector once we have wrapped around. Signed-off-by: H. Peter Anvin <hpa@zytor.com> LKML-Reference: <4B426550.6000209@kernel.org> --- arch/x86/kernel/apic/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d9cd1f1b9c07..e9ba0903e9d5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1162,7 +1162,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + static int current_vector = FIRST_DEVICE_VECTOR + VECTOR_OFFSET_START; + static int current_offset = VECTOR_OFFSET_START % 8; unsigned int old_vector; int cpu, err; cpumask_var_t tmp_mask; -- cgit v1.2.2 From 7f41c2e1523f628cc248e34192162aec5728bed7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Wed, 6 Jan 2010 10:56:31 -0800 Subject: x86, irq: Check move_in_progress before freeing the vector mapping With the recent irq migration fixes (post 2.6.32), Gary Hade has noticed "No IRQ handler for vector" messages during the 2.6.33-rc1 kernel boot on IBM AMD platforms and root caused the issue to this commit: > commit 23359a88e7eca3c4f402562b102f23014db3c2aa > Author: Suresh Siddha <suresh.b.siddha@intel.com> > Date: Mon Oct 26 14:24:33 2009 -0800 > > x86: Remove move_cleanup_count from irq_cfg As part of this patch, we have removed the move_cleanup_count check in smp_irq_move_cleanup_interrupt(). With this change, we can run into a situation where an irq cleanup interrupt on a cpu can cleanup the vector mappings associated with multiple irqs, of which one of the irq's migration might be still in progress. As such when that irq hits the old cpu, we get the "No IRQ handler" messages. Fix this by checking for the irq_cfg's move_in_progress and if the move is still in progress delay the vector cleanup to another irq cleanup interrupt request (which will happen when the irq starts arriving at the new cpu destination). Reported-and-tested-by: Gary Hade <garyhade@us.ibm.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <1262804191.2732.7.camel@sbs-t61.sc.intel.com> Cc: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index de00c4619a55..53243ca7816d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2434,6 +2434,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); raw_spin_lock(&desc->lock); + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; -- cgit v1.2.2 From 8558e3943df1c51c3377cb4e8a52ea484d6f357d Mon Sep 17 00:00:00 2001 From: Len Brown <len.brown@intel.com> Date: Wed, 6 Jan 2010 16:11:06 -0500 Subject: x86, ACPI: delete acpi_boot_table_init() return value cleanup only. setup_arch(), doesn't care care if ACPI initialization succeeded or failed, so delete acpi_boot_table_init()'s return value. Signed-off-by: Len Brown <len.brown@intel.com> --- arch/x86/kernel/acpi/boot.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb1035cd9a6a..036d28adf59d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1529,16 +1529,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * if acpi_blacklisted() acpi_disabled = 1; * acpi_irq_model=... * ... - * - * return value: (currently ignored) - * 0: success - * !0: failure */ -int __init acpi_boot_table_init(void) +void __init acpi_boot_table_init(void) { - int error; - dmi_check_system(acpi_dmi_table); /* @@ -1546,15 +1540,14 @@ int __init acpi_boot_table_init(void) * One exception: acpi=ht continues far enough to enumerate LAPICs */ if (acpi_disabled && !acpi_ht) - return 1; + return; /* * Initialize the ACPI boot-time table parser. */ - error = acpi_table_init(); - if (error) { + if (acpi_table_init()) { disable_acpi(); - return error; + return; } acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1562,18 +1555,15 @@ int __init acpi_boot_table_init(void) /* * blacklist may disable ACPI entirely */ - error = acpi_blacklisted(); - if (error) { + if (acpi_blacklisted()) { if (acpi_force) { printk(KERN_WARNING PREFIX "acpi=force override\n"); } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return error; + return; } } - - return 0; } int __init early_acpi_boot_init(void) -- cgit v1.2.2 From 99659a929d653d0c9ce458091870544768add871 Mon Sep 17 00:00:00 2001 From: Roel Kluin <roel.kluin@gmail.com> Date: Thu, 7 Jan 2010 15:35:42 +0100 Subject: x86, uv: Remove recursion in uv_heartbeat_enable() The recursion is not needed and does not improve readability. Signed-off-by: Roel Kluin <roel.kluin@gmail.com> LKML-Reference: <4B45F13E.3040202@gmail.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/x2apic_uv_x.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index af5d103bb533..d199dc34f54a 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -475,7 +475,7 @@ static void uv_heartbeat(unsigned long ignored) static void __cpuinit uv_heartbeat_enable(int cpu) { - if (!uv_cpu_hub_info(cpu)->scir.enabled) { + while (!uv_cpu_hub_info(cpu)->scir.enabled) { struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); @@ -483,11 +483,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu) timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_hub_info(cpu)->scir.enabled = 1; - } - /* check boot cpu */ - if (!uv_cpu_hub_info(0)->scir.enabled) - uv_heartbeat_enable(0); + /* also ensure that boot cpu is enabled */ + cpu = 0; + } } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.2 From 066000dd856709b6980123eb39b957fe26993f7b Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Date: Mon, 11 Jan 2010 15:51:04 -0800 Subject: Revert "x86, apic: Use logical flat on intel with <= 8 logical cpus" Revert commit 2fbd07a5f5d1295fa9b0c0564ec27da7c276a75a, as this commit breaks an IBM platform with quad-core Xeon cpu's. According to Suresh, this might be an IBM platform issue, as on other Intel platforms with <= 8 logical cpu's, logical flat mode works fine irespective of physical apic id values (inline with the xapic architecture). Revert this for now because of the IBM platform breakage. Another version will be re-submitted after the complete analysis. Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Acked-by: Suresh Siddha <suresh.b.siddha@intel.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/apic/apic.c | 26 ++++++++++++++++++-------- arch/x86/kernel/apic/probe_64.c | 15 ++++----------- 2 files changed, 22 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index aa57c079c98f..e80f291472a4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -62,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. * - * On AMD, this determines the messaging protocol we can use: if all APIC IDs + * This determines the messaging protocol we can use: if all APIC IDs * are in the 0 ... 7 range, then we can use logical addressing which * has some performance advantages (better broadcasting). * @@ -1898,14 +1898,24 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - def_to_bigsmp = 1; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj <ashok.raj@intel.com> + */ + if (max_physical_apicid >= 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: def_to_bigsmp = 1; + } } #endif diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c4cbd3080c1c..65edc180fc82 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,23 +64,16 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif if (apic == &apic_flat) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - apic = &apic_physflat; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - apic = &apic_physflat; - } + if (max_physical_apicid >= 8) + apic = &apic_physflat; + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); - if (is_vsmp_box()) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; -- cgit v1.2.2 From c2c5d45d46c8c0fd34291dec958670ad4816796f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Thu, 31 Dec 2009 03:52:25 +0100 Subject: perf: Stop stack frame walking off kernel addresses boundaries While processing kernel perf callchains, an bad entry can be considered as a valid stack pointer but not as a kernel address. In this case, we hang in an endless loop. This can happen in an x86-32 kernel after processing the last entry in a kernel stacktrace. Just stop the stack frame walking after we encounter an invalid kernel address. This fixes a hard lockup in x86-32. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Paul Mackerras <paulus@samba.org> LKML-Reference: <1262227945-27014-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/dumpstack.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c56bc2873030..6d817554780a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -123,13 +123,15 @@ print_context_stack_bp(struct thread_info *tinfo, while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { unsigned long addr = *ret_addr; - if (__kernel_text_address(addr)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); - } + if (!__kernel_text_address(addr)) + break; + + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); } + return (unsigned long)frame; } EXPORT_SYMBOL_GPL(print_context_stack_bp); -- cgit v1.2.2 From 0fb8ee48d9dfff6a0913ceb0be2068d8be203763 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Thu, 31 Dec 2009 05:53:03 +0100 Subject: perf: Drop useless check for ignored frame The check that ignores the debug and nmi stack frames is useless now that we have a frame pointer that makes us start at the right place. We don't anymore have to deal with these. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Paul Mackerras <paulus@samba.org> LKML-Reference: <1262235183-5320-2-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 8 -------- arch/x86/kernel/dumpstack_32.c | 5 ----- arch/x86/kernel/dumpstack_64.c | 5 ----- 3 files changed, 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d616c06e99b4..b1bb8c550526 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2297,7 +2297,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_ignored_frame); static void @@ -2313,10 +2312,6 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_ignored_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name) || - x86_is_stack_id(DEBUG_STACK, name); - return 0; } @@ -2324,9 +2319,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_ignored_frame, smp_processor_id())) - return; - if (reliable) callchain_store(entry, addr); } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index ae775ca47b25..11540a189d93 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -18,11 +18,6 @@ #include "dumpstack.h" -/* Just a stub for now */ -int x86_is_stack_id(int id, char *name) -{ - return 0; -} void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f5..676bc051252e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = { #endif }; -int x86_is_stack_id(int id, char *name) -{ - return x86_stack_ids[id - 1] == name; -} - static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) { -- cgit v1.2.2 From aa5add93e92019018e905146f8c3d3f8e3c08300 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Tue, 5 Jan 2010 17:46:56 -0500 Subject: x86/ptrace: Remove unused regs_get_argument_nth API Because of dropping function argument syntax from kprobe-tracer, we don't need this API anymore. Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Roland McGrath <roland@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Michael Neuling <mikey@neuling.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: linuxppc-dev@ozlabs.org LKML-Reference: <20100105224656.19431.92588.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/ptrace.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..73554a3aae8c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -140,30 +140,6 @@ static const int arg_offs_table[] = { #endif }; -/** - * regs_get_argument_nth() - get Nth argument at function call - * @regs: pt_regs which contains registers at function entry. - * @n: argument number. - * - * regs_get_argument_nth() returns @n th argument of a function call. - * Since usually the kernel stack will be changed right after function entry, - * you must use this at function entry. If the @n th entry is NOT in the - * kernel stack or pt_regs, this returns 0. - */ -unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) -{ - if (n < ARRAY_SIZE(arg_offs_table)) - return *(unsigned long *)((char *)regs + arg_offs_table[n]); - else { - /* - * The typical case: arg n is on the stack. - * (Note: stack[0] = return address, so skip it) - */ - n -= ARRAY_SIZE(arg_offs_table); - return regs_get_kernel_stack_nth(regs, 1 + n); - } -} - /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. -- cgit v1.2.2 From fcfbb2b5facd65efa7284cc315225bfe3d1856c2 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Fri, 8 Jan 2010 12:13:54 -0800 Subject: x86: SGI UV: Fix mapping of MMIO registers This fixes the problem of the initialization code not correctly mapping the entire MMIO space on a UV system. A side effect is the map_high() interface needed to be changed to accommodate different address and size shifts. Signed-off-by: Mike Travis <travis@sgi.com> Reviewed-by: Mike Habeck <habeck@sgi.com> Cc: <stable@kernel.org> Cc: Jack Steiner <steiner@sgi.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> LKML-Reference: <4B479202.7080705@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/x2apic_uv_x.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 5f92494dab61..b8bb869a6618 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -374,13 +374,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, - int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, + int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; - paddr = base << shift; - bytes = (1UL << shift) * (max_pnode + 1); + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -396,7 +396,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) { - map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)gru.s.base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); @@ -410,7 +410,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); + map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -420,7 +420,8 @@ static __init void map_mmioh_high(int max_pnode) mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); + map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, + max_pnode, map_uc); } static __init void map_low_mmrs(void) -- cgit v1.2.2 From 42590a75019a50012f25a962246498dead428433 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Date: Mon, 4 Jan 2010 16:16:23 +0900 Subject: x86/agp: Fix agp_amd64_init and agp_amd64_cleanup This fixes the regression introduced by the commit f405d2c02395a74d3883bd03ded36457aa3697ad. The above commit fixes the following issue: http://marc.info/?l=linux-kernel&m=126192729110083&w=2 However, it doesn't work properly when you remove and insert the agp_amd64 module again. agp_amd64_init() and agp_amd64_cleanup should be called only when gart_iommu is not called earlier (that is, the GART IOMMU is not enabled). We need to use 'gart_iommu_aperture' to see if GART IOMMU is enabled or not. Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Cc: mitov@issp.bas.bg Cc: davej@redhat.com LKML-Reference: <20100104161603L.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/aperture_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3704997e8b25..f147a95fd84a 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -31,6 +31,7 @@ #include <asm/x86_init.h> int gart_iommu_aperture; +EXPORT_SYMBOL_GPL(gart_iommu_aperture); int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; -- cgit v1.2.2 From 864a0922dd128392467611d9857e5138c6a91999 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@openvz.org> Date: Wed, 13 Jan 2010 10:16:07 +0000 Subject: x86: kernel_thread() -- initialize SS to a known state Before the kernel_thread was converted into "C" we had pt_regs::ss set to __KERNEL_DS (by SAVE_ALL asm macro). Though I must admit I didn't find any *explicit* load of %ss from this structure the better to be on a safe side and set it to a known value. Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> Signed-off-by: Ian Campbell <ian.campbell@citrix.com> Cc: Christian Kujau <lists@nerdbynature.de> Cc: Jeremy Fitzhardinge <Jeremy.Fitzhardinge@citrix.com> Cc: Brian Gerst <brgerst@gmail.com> LKML-Reference: <1263377768-19600-1-git-send-email-ian.campbell@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c6ee241c8a98..02c3ee013ccd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -288,6 +288,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; regs.gs = __KERNEL_STACK_CANARY; +#else + regs.ss = __KERNEL_DS; #endif regs.orig_ax = -1; -- cgit v1.2.2 From 557a701c16553b0b691dbb64ef30361115a80f64 Mon Sep 17 00:00:00 2001 From: Thomas Renninger <trenn@suse.de> Date: Mon, 14 Dec 2009 11:44:15 +0100 Subject: [CPUFREQ] Fix use after free of struct powernow_k8_data Easy fix for a regression introduced in 2.6.31. On managed CPUs the cpufreq.c core will call driver->exit(cpu) on the managed cpus and powernow_k8 will free the core's data. Later driver->get(cpu) function might get called trying to read out the current freq of a managed cpu and the NULL pointer check does not work on the freed object -> better set it to NULL. ->get() is unsigned and must return 0 as invalid frequency. Reference: http://bugzilla.kernel.org/show_bug.cgi?id=14391 Signed-off-by: Thomas Renninger <trenn@suse.de> Tested-by: Michal Schmidt <mschmidt@redhat.com> CC: stable@kernel.org Signed-off-by: Dave Jones <davej@redhat.com> --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a9df9441a9a2..2da4fa3bf6e9 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) kfree(data->powernow_table); kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; return 0; } @@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu) int err; if (!data) - return -EINVAL; + return 0; smp_call_function_single(cpu, query_values_on_cpu, &err, true); if (err) -- cgit v1.2.2 From 0f1d683fb35d6c6f49ef696c95757f3970682a0e Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Date: Thu, 17 Dec 2009 20:18:27 +0000 Subject: [CPUFREQ] Processor Clocking Control interface driver Processor Clocking Control (PCC) is an interface between the BIOS and OSPM. Based on the server workload, OSPM can request what frequency it expects from a logical CPU, and the BIOS will achieve that frequency transparently. This patch introduces driver support for PCC. OSPM uses the PCC driver to communicate with the BIOS via the PCC interface. There is a Documentation file that provides a link to the PCC Specification, and also provides a summary of the PCC interface. Currently, certain HP ProLiant platforms implement the PCC interface. However, any platform whose BIOS implements the PCC Specification, can utilize this driver. V2 --> V1 changes (based on Dominik's suggestions): - Removed the dependency on CPU_FREQ_TABLE - "cpufreq_stats" will no longer PANIC. Actually, it will not load anymore because it is not applicable. - Removed the sanity check for target frequency in the ->target routine. NOTE: A patch to sanitize the target frequency requested by "ondemand" is needed to ensure that the target freq < policy->min. Can this driver be queued up for the 2.6.33 tree? Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Signed-off-by: Matthew Garrett <mjg@redhat.com> Signed-off-by: Thomas Renninger <trenn@suse.de> Signed-off-by: Dave Jones <davej@redhat.com> --- arch/x86/kernel/cpu/cpufreq/Kconfig | 14 + arch/x86/kernel/cpu/cpufreq/Makefile | 1 + arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 621 ++++++++++++++++++++++++++++++ 3 files changed, 636 insertions(+) create mode 100644 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index f138c6c389b9..870e6cc6ad28 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -10,6 +10,20 @@ if CPU_FREQ comment "CPUFreq processor drivers" +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + <file:Documentation/cpu-freq/pcc-cpufreq.txt>. + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 509296df294d..1840c0a5170b 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c new file mode 100644 index 000000000000..29368854533c --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -0,0 +1,621 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <linux/cpufreq.h> +#include <linux/compiler.h> + +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> + +#include <acpi/processor.h> + +#define PCC_VERSION "1.00.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ + "pcc-cpufreq", msg) + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + dprintk("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + dprintk("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + dprintk("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%x\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + dprintk("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + dprintk("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == NULL) { + dprintk("probe: could not map shared mem region\n"); + goto out_free; + } + pcch_hdr = pcch_virt_addr; + + dprintk("probe: PCCH header (virtual) addr: 0x%llx\n", + (u64)pcch_hdr); + dprintk("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + dprintk("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + dprintk("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto pcch_null; + } + + result = pcc_get_offset(cpu); + if (result) { + dprintk("init: PCCP evaluation failed\n"); + goto free; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + dprintk("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); + + return 0; +free: + pcc_clear_mapping(); + free_percpu(pcc_cpu_info); +pcch_null: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + .init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); -- cgit v1.2.2 From fb4635932a4e19c2f55383f968a0e9b64da37354 Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Thu, 7 Jan 2010 15:26:22 -0500 Subject: [CPUFREQ] Fix cast warning in pcc driver. arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c:458: warning: cast from pointer to integer of different size Signed-off-by: Dave Jones <davej@redhat.com> --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 29368854533c..ff36d2979a90 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -455,8 +455,7 @@ static int __init pcc_cpufreq_probe(void) } pcch_hdr = pcch_virt_addr; - dprintk("probe: PCCH header (virtual) addr: 0x%llx\n", - (u64)pcch_hdr); + dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); dprintk("probe: PCCH header is at physical address: 0x%llx," " signature: 0x%x, length: %d bytes, major: %d, minor: %d," " supported features: 0x%x, command field: 0x%x," -- cgit v1.2.2 From 7a1110e861b2666ac09f5708d6fbe71d18ce64bb Mon Sep 17 00:00:00 2001 From: Jack Steiner <steiner@sgi.com> Date: Tue, 12 Jan 2010 15:09:04 -0600 Subject: x86, uv: Add function retrieving node controller revision number Add function for determining the revision id of the SGI UV node controller chip (HUB). This function is needed in a subsequent patch. Signed-off-by: Jack Steiner <steiner@sgi.com> LKML-Reference: <20100112210904.GA24546@sgi.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/x2apic_uv_x.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b8bb869a6618..0e48de9ff864 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -36,6 +36,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +int uv_min_hub_revision_id; +EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); static inline bool is_GRU_range(u64 start, u64 end) { @@ -55,6 +57,10 @@ static int early_get_nodeid(void) mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); node_id.v = *mmr; early_iounmap(mmr, sizeof(*mmr)); + + /* Currently, all blades have same revision number */ + uv_min_hub_revision_id = node_id.s.revision; + return node_id.s.node_id; } -- cgit v1.2.2 From 1d2c867c941d635e53e8ad7bf37d060bb5b25ec5 Mon Sep 17 00:00:00 2001 From: Russ Anderson <rja@sgi.com> Date: Fri, 15 Jan 2010 12:09:09 -0600 Subject: x86, uv: Ensure hub revision set for all ACPI modes. Ensure that UV hub revision is set for all ACPI modes. Signed-off-by: Russ Anderson <rja@sgi.com> LKML-Reference: <20100115180908.GB7757@sgi.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/x2apic_uv_x.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0e48de9ff864..21db3cbea7dc 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -66,7 +66,10 @@ static int early_get_nodeid(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { + int nodeid; + if (!strcmp(oem_id, "SGI")) { + nodeid = early_get_nodeid(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; @@ -74,7 +77,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); + nodeid << (UV_APIC_PNODE_SHIFT - 1); uv_system_type = UV_NON_UNIQUE_APIC; return 1; } -- cgit v1.2.2 From 00097c4fdf117d9845d772f571a987ae95523f8c Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com> Date: Sun, 17 Jan 2010 19:44:44 -0200 Subject: x86, trivial: Fix grammo in tsc comment about Geode TSC reliability Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com> Cc: marcelo@kvack.org Cc: dilinger@collabora.co.uk Cc: trivial@kernel.org LKML-Reference: <1263764685-9871-1-git-send-email-cascardo@holoscopio.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba0..23066ecf12fa 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void) unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ + /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; #endif -- cgit v1.2.2 From 722b3654852e48b93367a63f8ada9ee1cd43f2d3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Wed, 13 Jan 2010 16:19:10 -0800 Subject: x86, vmi: Fix vmi_get_timer_vector() to use IRQ0_VECTOR FIRST_DEVICE_VECTOR is going away and it looks like a bad hack to steal FIRST_DEVICE_VECTOR / FIRST_EXTERNAL_VECTOR, when it looks like it needs IRQ0_VECTOR. Fix vmi_get_timer_vector() to use IRQ0_VECTOR. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <20100114002118.436172066@sbs-t61.sc.intel.com> Cc: Alok N Kataria <akataria@vmware.com> Cc: Zach Amsden <zach@vmware.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/vmiclock_32.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 74c92bb194df..1268d993e9ca 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void) static inline unsigned int vmi_get_timer_vector(void) { -#ifdef CONFIG_X86_IO_APIC - return FIRST_DEVICE_VECTOR; -#else - return FIRST_EXTERNAL_VECTOR; -#endif + return IRQ0_VECTOR; } /** vmi clockchip */ @@ -239,8 +235,6 @@ void __init vmi_time_init(void) vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.2 From 6579b474572fd54c583ac074e8e7aaae926c62ef Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Wed, 13 Jan 2010 16:19:11 -0800 Subject: x86, irq: Use 0x20 for the IRQ_MOVE_CLEANUP_VECTOR instead of 0x1f After talking to some more folks inside intel (Peter Anvin, Asit Mallick), the safest option (for future compatibility etc) seen was to use vector 0x20 for IRQ_MOVE_CLEANUP_VECTOR instead of using vector 0x1f (which is documented as reserved vector in the Intel IA32 manuals). Also we don't need to reserve the entire privilege level (all 16 vectors in the priority bucket that IRQ_MOVE_CLEANUP_VECTOR falls into), as the x86 architecture (section 10.9.3 in SDM Vol3a) specifies that with in the priority level, the higher the vector number the higher the priority. And hence we don't need to reserve the complete priority level 0x20-0x2f for the IRQ migration cleanup logic. So change the IRQ_MOVE_CLEANUP_VECTOR to 0x20 and allow 0x21-0x2f to be used for device interrupts. 0x30-0x3f will be used for ISA interrupts (these also can be migrated in the context of IOAPIC and hence need to be at a higher priority level than IRQ_MOVE_CLEANUP_VECTOR). Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <20100114002118.521826763@sbs-t61.sc.intel.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e9ba0903e9d5..409f4943dc1a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1162,7 +1162,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR + VECTOR_OFFSET_START; + static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 8; unsigned int old_vector; int cpu, err; @@ -1199,7 +1199,7 @@ next: if (vector >= first_system_vector) { /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; + vector = FIRST_EXTERNAL_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; -- cgit v1.2.2 From dfea91d5a7c795fd6f4e1a97489a98e4e767463e Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Mon, 18 Jan 2010 12:10:48 -0800 Subject: x86, apic: use physical mode for IBM summit platforms Chris McDermott from IBM confirmed that hurricane chipset in IBM summit platforms doesn't support logical flat mode. Irrespective of the other things like apic_id's, total number of logical cpu's, Linux kernel should default to physical mode for this system. The 32-bit kernel does so using the OEM checks for the IBM summit platform. Add a similar OEM platform check for the 64bit kernel too. Otherwise the linux kernel boot can hang on this platform under certain bios/platform settings. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Tested-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Chris McDermott <lcm@linux.vnet.ibm.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: stable@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/apic/apic_flat_64.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index eacbd2b31d27..e3c3d820c325 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } #endif return 0; -- cgit v1.2.2 From bb668da6d6f2bec8a63838c098d9515eccb22cc4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Mon, 18 Jan 2010 12:10:49 -0800 Subject: x86, apic: use logical flat for systems with <= 8 logical cpus We can use logical flat mode if there are <= 8 logical cpu's (irrespective of physical apic id values). This will enable simplified and efficient IPI and device interrupt routing on such platforms. This has been tested to work on both Intel and AMD platforms. Exceptions like IBM summit platform which can't use logical flat mode are addressed by using OEM platform checks. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Chris McDermott <lcm@linux.vnet.ibm.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/apic/apic.c | 15 +-------------- arch/x86/kernel/apic/probe_64.c | 8 +++----- 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e80f291472a4..3987e4408f75 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. - * - * This determines the messaging protocol we can use: if all APIC IDs - * are in the 0 ... 7 range, then we can use logical addressing which - * has some performance advantages (better broadcasting). - * - * If there's an APIC ID above 8, we use physical addressing. */ unsigned int max_physical_apicid; @@ -1898,14 +1892,7 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj <ashok.raj@intel.com> - */ - if (max_physical_apicid >= 8) { + if (num_processors > 8) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: if (!APIC_XAPIC(version)) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 65edc180fc82..450fe2064a14 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,15 +64,13 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif - if (apic == &apic_flat) { - if (max_physical_apicid >= 8) + if (apic == &apic_flat && num_processors > 8) apic = &apic_physflat; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); - } + + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); if (is_vsmp_box()) { /* need to update phys_pkg_id */ -- cgit v1.2.2 From 97943390b043bcafca69f9163b86bbf627b75589 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Tue, 19 Jan 2010 12:20:54 -0800 Subject: x86, irq: Don't block IRQ0_VECTOR..IRQ15_VECTOR's on all cpu's Currently IRQ0..IRQ15 are assigned to IRQ0_VECTOR..IRQ15_VECTOR's on all the cpu's. If these IRQ's are handled by legacy pic controller, then the kernel handles them only on cpu 0. So there is no need to block this vector space on all cpu's. Similarly if these IRQ's are handled by IO-APIC, then the IRQ affinity will determine on which cpu's we need allocate the vector resource for that particular IRQ. This can be done dynamically and here also there is no need to block 16 vectors for IRQ0..IRQ15 on all cpu's. Fix this by initially assigning IRQ0..IRQ15 to IRQ0_VECTOR..IRQ15_VECTOR's only on cpu 0. If the legacy controllers like pic handles these irq's, then this configuration will be fixed. If more modern controllers like IO-APIC handle these IRQ's, then we start with this configuration and as IRQ's migrate, vectors (/and cpu's) associated with these IRQ's change dynamically. This will freeup the block of 16 vectors on other cpu's which don't handle IRQ0..IRQ15, which can now be used for other IRQ's that the particular cpu handle. [ hpa: this also an architectural cleanup for future legacy-PIC-free configurations. ] [ hpa: fixed typo NR_LEGACY_IRQS -> NR_IRQS_LEGACY ] Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <1263932453.2814.52.camel@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 33 ++++++++++----------------------- arch/x86/kernel/irqinit.c | 35 +++++++++++++++++------------------ arch/x86/kernel/vmiclock_32.c | 2 ++ 3 files changed, 29 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 409f4943dc1a..1a30587a6bc2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -94,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* Number of legacy interrupts */ -static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; @@ -140,27 +138,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg irq_cfgx[] = { +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; #else -static struct irq_cfg irq_cfgx[NR_IRQS] = { +static struct irq_cfg irq_cfgx[NR_IRQS]; #endif - [0] = { .vector = IRQ0_VECTOR, }, - [1] = { .vector = IRQ1_VECTOR, }, - [2] = { .vector = IRQ2_VECTOR, }, - [3] = { .vector = IRQ3_VECTOR, }, - [4] = { .vector = IRQ4_VECTOR, }, - [5] = { .vector = IRQ5_VECTOR, }, - [6] = { .vector = IRQ6_VECTOR, }, - [7] = { .vector = IRQ7_VECTOR, }, - [8] = { .vector = IRQ8_VECTOR, }, - [9] = { .vector = IRQ9_VECTOR, }, - [10] = { .vector = IRQ10_VECTOR, }, - [11] = { .vector = IRQ11_VECTOR, }, - [12] = { .vector = IRQ12_VECTOR, }, - [13] = { .vector = IRQ13_VECTOR, }, - [14] = { .vector = IRQ14_VECTOR, }, - [15] = { .vector = IRQ15_VECTOR, }, -}; void __init io_apic_disable_legacy(void) { @@ -185,8 +166,14 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < nr_legacy_irqs) - cpumask_setall(cfg[i].domain); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. + */ + if (i < nr_legacy_irqs) { + cfg[i].vector = IRQ0_VECTOR + i; + cpumask_set_cpu(0, cfg[i].domain); + } } return 0; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index d5932226614f..fce55d532631 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -84,24 +84,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 + [0 ... NR_VECTORS - 1] = -1, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -116,6 +99,9 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } +/* Number of legacy interrupts */ +int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; + void __init init_ISA_irqs(void) { int i; @@ -142,6 +128,19 @@ void __init init_ISA_irqs(void) void __init init_IRQ(void) { + int i; + + /* + * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * If these IRQ's are handled by legacy interrupt-controllers like PIC, + * then this configuration will likely be static after the boot. If + * these IRQ's are handled by more mordern controllers like IO-APIC, + * then this vector space can be freed and re-used dynamically as the + * irq's migrate etc. + */ + for (i = 0; i < nr_legacy_irqs; i++) + per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + x86_init.irqs.intr_init(); } diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 1268d993e9ca..2f1ca5614292 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -235,6 +235,8 @@ void __init vmi_time_init(void) vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.2 From b27d515a49169e5e2a92d621faac761074a8c5b1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Mon, 18 Jan 2010 10:58:01 +0200 Subject: perf: x86: Add support for the ANY bit Propagate the ANY bit into the fixed counter config for v3 and higher. Signed-off-by: Stephane Eranian <eranian@google.com> [a.p.zijlstra@chello.nl: split from larger patch] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <4b5430c6.0f975e0a.1bf9.ffff85fe@mx.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d616c06e99b4..8c1c07073ccc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1343,6 +1343,13 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + bits <<= (idx * 4); mask = 0xfULL << (idx * 4); -- cgit v1.2.2 From d91afd15b041f27d34859c79afa9e172018a86f4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Fri, 22 Jan 2010 16:40:20 +0100 Subject: x86/amd-iommu: Fix possible integer overflow The variable i in this function could be increased to over 2**32 which would result in an integer overflow when using int. Fix it by changing i to unsigned long. Cc: stable@kernel.org Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 23824fef789c..c2ccbd7b862f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -980,7 +980,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; struct amd_iommu *iommu; - int i; + unsigned long i; #ifdef CONFIG_IOMMU_STRESS populate = false; -- cgit v1.2.2 From 2ca762790caf822f7b61430fbaffa3ae4219977f Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Fri, 22 Jan 2010 16:45:31 +0100 Subject: x86/amd-iommu: Fix NULL pointer dereference in __detach_device() In the __detach_device function the reference count for a device-domain binding may become zero. This results in the device being removed from the domain and dev_data->domain will be NULL. This is bad because this pointer is dereferenced when trying to unlock the domain->lock. This patch fixes the issue by keeping the domain in a seperate variable. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c2ccbd7b862f..4478a48198a8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1489,11 +1489,14 @@ static void __detach_device(struct device *dev) { struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; + struct protection_domain *domain; unsigned long flags; BUG_ON(!dev_data->domain); - spin_lock_irqsave(&dev_data->domain->lock, flags); + domain = dev_data->domain; + + spin_lock_irqsave(&domain->lock, flags); if (dev_data->alias != dev) { alias_data = get_dev_data(dev_data->alias); @@ -1504,7 +1507,7 @@ static void __detach_device(struct device *dev) if (atomic_dec_and_test(&dev_data->bind)) do_detach(dev); - spin_unlock_irqrestore(&dev_data->domain->lock, flags); + spin_unlock_irqrestore(&domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the -- cgit v1.2.2 From f5325094379158e6b876ea0010c807bf7890ec8f Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Fri, 22 Jan 2010 17:44:35 +0100 Subject: x86/amd-iommu: Fix IOMMU-API initialization for iommu=pt This patch moves the initialization of the iommu-api out of the dma-ops initialization code. This ensures that the iommu-api is initialized even with iommu=pt. Cc: stable@kernel.org Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 8 ++++++-- arch/x86/kernel/amd_iommu_init.c | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 4478a48198a8..751ce73c6e1b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2221,6 +2221,12 @@ static struct dma_map_ops amd_iommu_dma_ops = { /* * The function which clues the AMD IOMMU driver into dma_ops. */ + +void __init amd_iommu_init_api(void) +{ + register_iommu(&amd_iommu_ops); +} + int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; @@ -2256,8 +2262,6 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; - register_iommu(&amd_iommu_ops); - amd_iommu_stats_init(); return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fb490ce7dd55..9dc91b431470 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1292,9 +1292,12 @@ static int __init amd_iommu_init(void) ret = amd_iommu_init_passthrough(); else ret = amd_iommu_init_dma_ops(); + if (ret) goto free; + amd_iommu_init_api(); + amd_iommu_init_notifier(); enable_iommus(); -- cgit v1.2.2 From d3ad9373b7c29b63d5e8460a69453718d200cc3b Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Fri, 22 Jan 2010 17:55:27 +0100 Subject: x86/amd-iommu: Fix deassignment of a device from the pt_domain Deassigning a device from the passthrough domain does not work and breaks device assignment to kvm guests. This patch fixes the issue. Cc: stable@kernel.org Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 751ce73c6e1b..adb0ba025702 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1511,9 +1511,11 @@ static void __detach_device(struct device *dev) /* * If we run in passthrough mode the device must be assigned to the - * passthrough domain if it is detached from any other domain + * passthrough domain if it is detached from any other domain. + * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through && dev_data->domain == NULL) + if (iommu_pass_through && + (dev_data->domain == NULL && domain != pt_domain)) __attach_device(dev, pt_domain); } -- cgit v1.2.2 From dcf39daf3d6d97f8741e82f0b9fb7554704ed2d1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov <borislav.petkov@amd.com> Date: Fri, 22 Jan 2010 16:01:05 +0100 Subject: x86, cacheinfo: Fix disabling of L3 cache indices * Correct the masks used for writing the cache index disable indices. * Do not turn off L3 scrubber - it is not necessary. * Make sure wbinvd is executed on the same node where the L3 is. * Check for out-of-bounds values written to the registers. * Make show_cache_disable hex values unambiguous * Check for Erratum #388 Signed-off-by: Borislav Petkov <borislav.petkov@amd.com> LKML-Reference: <1264172467-25155-4-git-send-email-bp@amd64.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/intel_cacheinfo.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc6c8ef92dcc..08c91abc4d32 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -18,6 +18,7 @@ #include <asm/processor.h> #include <linux/smp.h> #include <asm/k8.h> +#include <asm/smp.h> #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -299,8 +300,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) if (boot_cpu_data.x86 == 0x11) return; - /* see erratum #382 */ - if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + /* see errata #382 and #388 */ + if ((boot_cpu_data.x86 == 0x10) && + ((boot_cpu_data.x86_model < 0x9) || + (boot_cpu_data.x86_mask < 0x1))) return; this_leaf->can_disable = 1; @@ -726,12 +729,12 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, return -EINVAL; pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "%x\n", reg); + return sprintf(buf, "0x%08x\n", reg); } #define SHOW_CACHE_DISABLE(index) \ static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ { \ return show_cache_disable(this_leaf, buf, index); \ } @@ -745,7 +748,9 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; - unsigned int scrubber = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff if (!this_leaf->can_disable) return -EINVAL; @@ -759,21 +764,24 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - val |= 0xc0000000; - - pci_read_config_dword(dev, 0x58, &scrubber); - scrubber &= ~0x1f000000; - pci_write_config_dword(dev, 0x58, scrubber); + /* do not allow writes outside of allowed bits */ + if (val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) + return -EINVAL; - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); - wbinvd(); + val |= BIT(30); pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. + */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); return count; } #define STORE_CACHE_DISABLE(index) \ static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ const char *buf, size_t count) \ { \ return store_cache_disable(this_leaf, buf, count, index); \ -- cgit v1.2.2 From 897de50e08937663912c86fb12ad7f708af2386c Mon Sep 17 00:00:00 2001 From: Borislav Petkov <borislav.petkov@amd.com> Date: Fri, 22 Jan 2010 16:01:06 +0100 Subject: x86, cacheinfo: Add cache index disable sysfs attrs only to L3 caches The cache_disable_[01] attribute in /sys/devices/system/cpu/cpu?/cache/index[0-3]/ is enabled on all cache levels although only L3 supports it. Add it only to the cache level that actually supports it. Signed-off-by: Borislav Petkov <borislav.petkov@amd.com> LKML-Reference: <1264172467-25155-5-git-send-email-bp@amd64.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/intel_cacheinfo.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 08c91abc4d32..3976ce95095f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -814,16 +814,24 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); +#define DEFAULT_SYSFS_CACHE_ATTRS \ + &type.attr, \ + &level.attr, \ + &coherency_line_size.attr, \ + &physical_line_partition.attr, \ + &ways_of_associativity.attr, \ + &number_of_sets.attr, \ + &size.attr, \ + &shared_cpu_map.attr, \ + &shared_cpu_list.attr + static struct attribute *default_attrs[] = { - &type.attr, - &level.attr, - &coherency_line_size.attr, - &physical_line_partition.attr, - &ways_of_associativity.attr, - &number_of_sets.attr, - &size.attr, - &shared_cpu_map.attr, - &shared_cpu_list.attr, + DEFAULT_SYSFS_CACHE_ATTRS, + NULL +}; + +static struct attribute *default_l3_attrs[] = { + DEFAULT_SYSFS_CACHE_ATTRS, &cache_disable_0.attr, &cache_disable_1.attr, NULL @@ -916,6 +924,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; + struct _cpuid4_info *this_leaf; int retval; retval = cpuid4_cache_sysfs_init(cpu); @@ -934,6 +943,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; + + this_leaf = CPUID4_INFO_IDX(cpu, i); + + if (this_leaf->can_disable) + ktype_cache.default_attrs = default_l3_attrs; + else + ktype_cache.default_attrs = default_attrs; + retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), -- cgit v1.2.2 From 048a8774ca43488d78605031f11cc206d7a2682a Mon Sep 17 00:00:00 2001 From: Borislav Petkov <borislav.petkov@amd.com> Date: Fri, 22 Jan 2010 16:01:07 +0100 Subject: x86, cacheinfo: Calculate L3 indices We need to know the valid L3 indices interval when disabling them over /sysfs. Do that when the core is brought online and add boundary checks to the sysfs .store attribute. Signed-off-by: Borislav Petkov <borislav.petkov@amd.com> LKML-Reference: <1264172467-25155-6-git-send-email-bp@amd64.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/intel_cacheinfo.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3976ce95095f..589b705e80ed 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -151,7 +151,8 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; @@ -161,7 +162,8 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; }; unsigned short num_cache_leaves; @@ -291,6 +293,29 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +static unsigned int __cpuinit amd_calc_l3_indices(void) +{ + /* + * We're called over smp_call_function_single() and therefore + * are on the correct cpu. + */ + int cpu = smp_processor_id(); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int sc0, sc1, sc2, sc3; + u32 val; + + pci_read_config_dword(dev, 0x1C4, &val); + + /* calculate subcache sizes */ + sc0 = !(val & BIT(0)); + sc1 = !(val & BIT(4)); + sc2 = !(val & BIT(8)) + !(val & BIT(9)); + sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; +} + static void __cpuinit amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { @@ -306,7 +331,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) (boot_cpu_data.x86_mask < 0x1))) return; - this_leaf->can_disable = 1; + this_leaf->can_disable = true; + this_leaf->l3_indices = amd_calc_l3_indices(); } static int @@ -765,7 +791,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, return -EINVAL; /* do not allow writes outside of allowed bits */ - if (val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) return -EINVAL; val |= BIT(30); -- cgit v1.2.2 From 73472a46b5b28116b145fb5fc05242c1aa8e1461 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com> Date: Thu, 21 Jan 2010 11:09:52 -0800 Subject: x86: Disable HPET MSI on ATI SB700/SB800 HPET MSI on platforms with ATI SB700/SB800 as they seem to have some side-effects on floppy DMA. Do not use HPET MSI on such platforms. Original problem report from Mark Hounschell http://lkml.indiana.edu/hypermail/linux/kernel/0912.2/01118.html [ This patch needs to go to stable as well. But, there are some conflicts that prevents the patch from going as is. I can rebase/resubmit to stable once the patch goes upstream. hpa: still Cc:'ing stable@ as an FYI. ] Tested-by: Mark Hounschell <markh@compro.net> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: <stable@kernel.org> LKML-Reference: <20100121190952.GA32523@linux-os.sc.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/hpet.c | 8 ++++++++ arch/x86/kernel/quirks.c | 13 +++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba6e65884603..ad80a1c718c6 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -34,6 +34,8 @@ */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ +u8 hpet_msi_disable; + #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -596,6 +598,9 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (hpet_msi_disable) + return; + if (boot_cpu_has(X86_FEATURE_ARAT)) return; id = hpet_readl(HPET_ID); @@ -928,6 +933,9 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (hpet_msi_disable) + return 0; + if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 18093d7498f0..12e9feaa2f7a 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,6 +491,19 @@ void force_hpet_resume(void) break; } } + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + #endif #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) -- cgit v1.2.2 From 3b2e3d85aeb80769fb96c15ee4f6e14135328471 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 22 Jan 2010 21:34:56 +0100 Subject: Revert "x86: ucode-amd: Load ucode-patches once ..." Commit d1c84f79a6ba992dc01e312c44a21496303874d6 leads to a regression when microcode_amd.c is compiled into the kernel. It causes a big boot delay because the firmware is not available. See http://marc.info/?l=linux-kernel&m=126267290920060 It also renders the reload sysfs attribute useless. Fixing this is too intrusive for an -rc5 kernel. Thus I'd like to restore the microcode loading behaviour of kernel 2.6.32. CC: Gene Heskett <gene.heskett@verizon.net> Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> LKML-Reference: <20100122203456.GB13792@alberich.amd.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/microcode_amd.c | 44 ++++++++++++---------------------------- arch/x86/kernel/microcode_core.c | 6 ------ 2 files changed, 13 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 37542b67c57e..e1af7c055c7d 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -36,9 +36,6 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 -const struct firmware *firmware; -static int supported_cpu; - struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -77,12 +74,15 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { + struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - if (!supported_cpu) - return -1; - memset(csig, 0, sizeof(*csig)); + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); + return -1; + } rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; @@ -294,10 +294,14 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; enum ucode_state ret; - if (firmware == NULL) + if (request_firmware(&firmware, fw_name, device)) { + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return UCODE_NFOUND; + } if (*(u32 *)firmware->data != UCODE_MAGIC) { pr_err("invalid UCODE_MAGIC (0x%08x)\n", @@ -307,6 +311,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) ret = generic_load_microcode(cpu, firmware->data, firmware->size); + release_firmware(firmware); + return ret; } @@ -325,31 +331,7 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } -void init_microcode_amd(struct device *device) -{ - const char *fw_name = "amd-ucode/microcode_amd.bin"; - struct cpuinfo_x86 *c = &boot_cpu_data; - - WARN_ON(c->x86_vendor != X86_VENDOR_AMD); - - if (c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); - return; - } - supported_cpu = 1; - - if (request_firmware(&firmware, fw_name, device)) - pr_err("failed to load file %s\n", fw_name); -} - -void fini_microcode_amd(void) -{ - release_firmware(firmware); -} - static struct microcode_ops microcode_amd_ops = { - .init = init_microcode_amd, - .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 0c8632433090..cceb5bc3c3c2 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -521,9 +521,6 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } - if (microcode_ops->init) - microcode_ops->init(µcode_pdev->dev); - get_online_cpus(); mutex_lock(µcode_mutex); @@ -566,9 +563,6 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); - if (microcode_ops->fini) - microcode_ops->fini(); - microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -- cgit v1.2.2 From b160091802d4a76dd063facb09fcf10bf5d5d747 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Sat, 23 Jan 2010 18:27:47 -0800 Subject: x86: Remove "x86 CPU features in debugfs" (CONFIG_X86_CPU_DEBUG) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_X86_CPU_DEBUG, which provides some parsed versions of the x86 CPU configuration via debugfs, has caused boot failures on real hardware. The value of this feature has been marginal at best, as all this information is already available to userspace via generic interfaces. Causes crashes that have not been fixed + minimal utility -> remove. See the referenced LKML thread for more information. Reported-by: Ozan Çağlayan <ozan@pardus.org.tr> Signed-off-by: H. Peter Anvin <hpa@zytor.com> LKML-Reference: <alpine.LFD.2.00.1001221755320.13231@localhost.localdomain> Cc: Jaswinder Singh Rajput <jaswinder@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Rafael J. Wysocki <rjw@sisk.pl> Cc: Yinghai Lu <yinghai@kernel.org> Cc: <stable@kernel.org> --- arch/x86/kernel/cpu/Makefile | 2 - arch/x86/kernel/cpu/cpu_debug.c | 688 ---------------------------------------- 2 files changed, 690 deletions(-) delete mode 100644 arch/x86/kernel/cpu/cpu_debug.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1d2cb383410e..c202b62f3671 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o -obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c deleted file mode 100644 index b368cd862997..000000000000 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CPU x86 architecture debug code - * - * Copyright(C) 2009 Jaswinder Singh Rajput - * - * For licencing details see kernel-base/COPYING - */ - -#include <linux/interrupt.h> -#include <linux/compiler.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/smp.h> - -#include <asm/cpu_debug.h> -#include <asm/paravirt.h> -#include <asm/system.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/desc.h> - -static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr); -static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr); -static DEFINE_PER_CPU(int, cpud_priv_count); - -static DEFINE_MUTEX(cpu_debug_lock); - -static struct dentry *cpu_debugfs_dir; - -static struct cpu_debug_base cpu_base[] = { - { "mc", CPU_MC, 0 }, - { "monitor", CPU_MONITOR, 0 }, - { "time", CPU_TIME, 0 }, - { "pmc", CPU_PMC, 1 }, - { "platform", CPU_PLATFORM, 0 }, - { "apic", CPU_APIC, 0 }, - { "poweron", CPU_POWERON, 0 }, - { "control", CPU_CONTROL, 0 }, - { "features", CPU_FEATURES, 0 }, - { "lastbranch", CPU_LBRANCH, 0 }, - { "bios", CPU_BIOS, 0 }, - { "freq", CPU_FREQ, 0 }, - { "mtrr", CPU_MTRR, 0 }, - { "perf", CPU_PERF, 0 }, - { "cache", CPU_CACHE, 0 }, - { "sysenter", CPU_SYSENTER, 0 }, - { "therm", CPU_THERM, 0 }, - { "misc", CPU_MISC, 0 }, - { "debug", CPU_DEBUG, 0 }, - { "pat", CPU_PAT, 0 }, - { "vmx", CPU_VMX, 0 }, - { "call", CPU_CALL, 0 }, - { "base", CPU_BASE, 0 }, - { "ver", CPU_VER, 0 }, - { "conf", CPU_CONF, 0 }, - { "smm", CPU_SMM, 0 }, - { "svm", CPU_SVM, 0 }, - { "osvm", CPU_OSVM, 0 }, - { "tss", CPU_TSS, 0 }, - { "cr", CPU_CR, 0 }, - { "dt", CPU_DT, 0 }, - { "registers", CPU_REG_ALL, 0 }, -}; - -static struct cpu_file_base cpu_file[] = { - { "index", CPU_REG_ALL, 0 }, - { "value", CPU_REG_ALL, 1 }, -}; - -/* CPU Registers Range */ -static struct cpu_debug_range cpu_reg_range[] = { - { 0x00000000, 0x00000001, CPU_MC, }, - { 0x00000006, 0x00000007, CPU_MONITOR, }, - { 0x00000010, 0x00000010, CPU_TIME, }, - { 0x00000011, 0x00000013, CPU_PMC, }, - { 0x00000017, 0x00000017, CPU_PLATFORM, }, - { 0x0000001B, 0x0000001B, CPU_APIC, }, - { 0x0000002A, 0x0000002B, CPU_POWERON, }, - { 0x0000002C, 0x0000002C, CPU_FREQ, }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, }, - { 0x00000040, 0x00000047, CPU_LBRANCH, }, - { 0x00000060, 0x00000067, CPU_LBRANCH, }, - { 0x00000079, 0x00000079, CPU_BIOS, }, - { 0x00000088, 0x0000008A, CPU_CACHE, }, - { 0x0000008B, 0x0000008B, CPU_BIOS, }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, }, - { 0x000000C1, 0x000000C4, CPU_PMC, }, - { 0x000000CD, 0x000000CD, CPU_FREQ, }, - { 0x000000E7, 0x000000E8, CPU_PERF, }, - { 0x000000FE, 0x000000FE, CPU_MTRR, }, - - { 0x00000116, 0x0000011E, CPU_CACHE, }, - { 0x00000174, 0x00000176, CPU_SYSENTER, }, - { 0x00000179, 0x0000017B, CPU_MC, }, - { 0x00000186, 0x00000189, CPU_PMC, }, - { 0x00000198, 0x00000199, CPU_PERF, }, - { 0x0000019A, 0x0000019A, CPU_TIME, }, - { 0x0000019B, 0x0000019D, CPU_THERM, }, - { 0x000001A0, 0x000001A0, CPU_MISC, }, - { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, }, - { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, }, - { 0x00000250, 0x00000250, CPU_MTRR, }, - { 0x00000258, 0x00000259, CPU_MTRR, }, - { 0x00000268, 0x0000026F, CPU_MTRR, }, - { 0x00000277, 0x00000277, CPU_PAT, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, }, - - { 0x00000300, 0x00000311, CPU_PMC, }, - { 0x00000345, 0x00000345, CPU_PMC, }, - { 0x00000360, 0x00000371, CPU_PMC, }, - { 0x0000038D, 0x00000390, CPU_PMC, }, - { 0x000003A0, 0x000003BE, CPU_PMC, }, - { 0x000003C0, 0x000003CD, CPU_PMC, }, - { 0x000003E0, 0x000003E1, CPU_PMC, }, - { 0x000003F0, 0x000003F2, CPU_PMC, }, - - { 0x00000400, 0x00000417, CPU_MC, }, - { 0x00000480, 0x0000048B, CPU_VMX, }, - - { 0x00000600, 0x00000600, CPU_DEBUG, }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, - - { 0x000107CC, 0x000107D3, CPU_PMC, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, }, - { 0xC0000081, 0xC0000084, CPU_CALL, }, - { 0xC0000100, 0xC0000102, CPU_BASE, }, - { 0xC0000103, 0xC0000103, CPU_TIME, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, }, - { 0xC0010010, 0xC0010010, CPU_CONF, }, - { 0xC0010015, 0xC0010015, CPU_CONF, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, }, - { 0xC001001F, 0xC001001F, CPU_CONF, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, }, - { 0xC0010044, 0xC0010048, CPU_MC, }, - { 0xC0010050, 0xC0010056, CPU_SMM, }, - { 0xC0010058, 0xC0010058, CPU_CONF, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, }, - { 0xC0010061, 0xC0010068, CPU_SMM, }, - { 0xC0010069, 0xC001006B, CPU_SMM, }, - { 0xC0010070, 0xC0010071, CPU_SMM, }, - { 0xC0010111, 0xC0010113, CPU_SMM, }, - { 0xC0010114, 0xC0010118, CPU_SVM, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, }, - { 0xC0011022, 0xC0011023, CPU_CONF, }, -}; - -static int is_typeflag_valid(unsigned cpu, unsigned flag) -{ - int i; - - /* Standard Registers should be always valid */ - if (flag >= CPU_TSS) - return 1; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (cpu_reg_range[i].flag == flag) - return 1; - } - - /* Invalid */ - return 0; -} - -static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, - int index, unsigned flag) -{ - if (cpu_reg_range[index].flag == flag) { - *min = cpu_reg_range[index].min; - *max = cpu_reg_range[index].max; - } else - *max = 0; - - return *max; -} - -/* This function can also be called with seq = NULL for printk */ -static void print_cpu_data(struct seq_file *seq, unsigned type, - u32 low, u32 high) -{ - struct cpu_private *priv; - u64 val = high; - - if (seq) { - priv = seq->private; - if (priv->file) { - val = (val << 32) | low; - seq_printf(seq, "0x%llx\n", val); - } else - seq_printf(seq, " %08x: %08x_%08x\n", - type, high, low); - } else - printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); -} - -/* This function can also be called with seq = NULL for printk */ -static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) -{ - unsigned msr, msr_min, msr_max; - struct cpu_private *priv; - u32 low, high; - int i; - - if (seq) { - priv = seq->private; - if (priv->file) { - if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, - &low, &high)) - print_cpu_data(seq, priv->reg, low, high); - return; - } - } - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) - continue; - - for (msr = msr_min; msr <= msr_max; msr++) { - if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) - continue; - print_cpu_data(seq, msr, low, high); - } - } -} - -static void print_tss(void *arg) -{ - struct pt_regs *regs = task_pt_regs(current); - struct seq_file *seq = arg; - unsigned int seg; - - seq_printf(seq, " RAX\t: %016lx\n", regs->ax); - seq_printf(seq, " RBX\t: %016lx\n", regs->bx); - seq_printf(seq, " RCX\t: %016lx\n", regs->cx); - seq_printf(seq, " RDX\t: %016lx\n", regs->dx); - - seq_printf(seq, " RSI\t: %016lx\n", regs->si); - seq_printf(seq, " RDI\t: %016lx\n", regs->di); - seq_printf(seq, " RBP\t: %016lx\n", regs->bp); - seq_printf(seq, " ESP\t: %016lx\n", regs->sp); - -#ifdef CONFIG_X86_64 - seq_printf(seq, " R08\t: %016lx\n", regs->r8); - seq_printf(seq, " R09\t: %016lx\n", regs->r9); - seq_printf(seq, " R10\t: %016lx\n", regs->r10); - seq_printf(seq, " R11\t: %016lx\n", regs->r11); - seq_printf(seq, " R12\t: %016lx\n", regs->r12); - seq_printf(seq, " R13\t: %016lx\n", regs->r13); - seq_printf(seq, " R14\t: %016lx\n", regs->r14); - seq_printf(seq, " R15\t: %016lx\n", regs->r15); -#endif - - asm("movl %%cs,%0" : "=r" (seg)); - seq_printf(seq, " CS\t: %04x\n", seg); - asm("movl %%ds,%0" : "=r" (seg)); - seq_printf(seq, " DS\t: %04x\n", seg); - seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); - asm("movl %%es,%0" : "=r" (seg)); - seq_printf(seq, " ES\t: %04x\n", seg); - asm("movl %%fs,%0" : "=r" (seg)); - seq_printf(seq, " FS\t: %04x\n", seg); - asm("movl %%gs,%0" : "=r" (seg)); - seq_printf(seq, " GS\t: %04x\n", seg); - - seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); - - seq_printf(seq, " EIP\t: %016lx\n", regs->ip); -} - -static void print_cr(void *arg) -{ - struct seq_file *seq = arg; - - seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); - seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); - seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); - seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); -#ifdef CONFIG_X86_64 - seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); -#endif -} - -static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) -{ - seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); -} - -static void print_dt(void *seq) -{ - struct desc_ptr dt; - unsigned long ldt; - - /* IDT */ - store_idt((struct desc_ptr *)&dt); - print_desc_ptr("IDT", seq, dt); - - /* GDT */ - store_gdt((struct desc_ptr *)&dt); - print_desc_ptr("GDT", seq, dt); - - /* LDT */ - store_ldt(ldt); - seq_printf(seq, " LDT\t: %016lx\n", ldt); - - /* TR */ - store_tr(ldt); - seq_printf(seq, " TR\t: %016lx\n", ldt); -} - -static void print_dr(void *arg) -{ - struct seq_file *seq = arg; - unsigned long dr; - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - get_debugreg(dr, i); - seq_printf(seq, " dr%d\t: %016lx\n", i, dr); - } - - seq_printf(seq, "\n MSR\t:\n"); -} - -static void print_apic(void *arg) -{ - struct seq_file *seq = arg; - -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(seq, " LAPIC\t:\n"); - seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); - seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR)); - seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); - seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); - seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); - seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); - seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); - seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); - seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); - seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); - seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); - seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); - seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); - seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); - seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); - seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); - seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); - seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); - seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); - seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); - seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); - if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { - unsigned int i, v, maxeilvt; - - v = apic_read(APIC_EFEAT); - maxeilvt = (v >> 16) & 0xff; - seq_printf(seq, " EFEAT\t\t: %08x\n", v); - seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); - - for (i = 0; i < maxeilvt; i++) { - v = apic_read(APIC_EILVTn(i)); - seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); - } - } -#endif /* CONFIG_X86_LOCAL_APIC */ - seq_printf(seq, "\n MSR\t:\n"); -} - -static int cpu_seq_show(struct seq_file *seq, void *v) -{ - struct cpu_private *priv = seq->private; - - if (priv == NULL) - return -EINVAL; - - switch (cpu_base[priv->type].flag) { - case CPU_TSS: - smp_call_function_single(priv->cpu, print_tss, seq, 1); - break; - case CPU_CR: - smp_call_function_single(priv->cpu, print_cr, seq, 1); - break; - case CPU_DT: - smp_call_function_single(priv->cpu, print_dt, seq, 1); - break; - case CPU_DEBUG: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_dr, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - case CPU_APIC: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_apic, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - - default: - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - } - seq_printf(seq, "\n"); - - return 0; -} - -static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos == 0) /* One time is enough ;-) */ - return seq; - - return NULL; -} - -static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - (*pos)++; - - return cpu_seq_start(seq, pos); -} - -static void cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations cpu_seq_ops = { - .start = cpu_seq_start, - .next = cpu_seq_next, - .stop = cpu_seq_stop, - .show = cpu_seq_show, -}; - -static int cpu_seq_open(struct inode *inode, struct file *file) -{ - struct cpu_private *priv = inode->i_private; - struct seq_file *seq; - int err; - - err = seq_open(file, &cpu_seq_ops); - if (!err) { - seq = file->private_data; - seq->private = priv; - } - - return err; -} - -static int write_msr(struct cpu_private *priv, u64 val) -{ - u32 low, high; - - high = (val >> 32) & 0xffffffff; - low = val & 0xffffffff; - - if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) - return 0; - - return -EPERM; -} - -static int write_cpu_register(struct cpu_private *priv, const char *buf) -{ - int ret = -EPERM; - u64 val; - - ret = strict_strtoull(buf, 0, &val); - if (ret < 0) - return ret; - - /* Supporting only MSRs */ - if (priv->type < CPU_TSS_BIT) - return write_msr(priv, val); - - return ret; -} - -static ssize_t cpu_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct cpu_private *priv = seq->private; - char buf[19]; - - if ((priv == NULL) || (count >= sizeof(buf))) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) - if (!write_cpu_register(priv, buf)) - return count; - - return -EACCES; -} - -static const struct file_operations cpu_fops = { - .owner = THIS_MODULE, - .open = cpu_seq_open, - .read = seq_read, - .write = cpu_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, - unsigned file, struct dentry *dentry) -{ - struct cpu_private *priv = NULL; - - /* Already intialized */ - if (file == CPU_INDEX_BIT) - if (per_cpu(cpud_arr[type].init, cpu)) - return 0; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv == NULL) - return -ENOMEM; - - priv->cpu = cpu; - priv->type = type; - priv->reg = reg; - priv->file = file; - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_priv_arr[type], cpu) = priv; - per_cpu(cpud_priv_count, cpu)++; - mutex_unlock(&cpu_debug_lock); - - if (file) - debugfs_create_file(cpu_file[file].name, S_IRUGO, - dentry, (void *)priv, &cpu_fops); - else { - debugfs_create_file(cpu_base[type].name, S_IRUGO, - per_cpu(cpud_arr[type].dentry, cpu), - (void *)priv, &cpu_fops); - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_arr[type].init, cpu) = 1; - mutex_unlock(&cpu_debug_lock); - } - - return 0; -} - -static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, - struct dentry *dentry) -{ - unsigned file; - int err = 0; - - for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { - err = cpu_create_file(cpu, type, reg, file, dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned reg, reg_min, reg_max; - int i, err = 0; - char reg_dir[12]; - u32 low, high; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, ®_min, ®_max, i, - cpu_base[type].flag)) - continue; - - for (reg = reg_min; reg <= reg_max; reg++) { - if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) - continue; - - sprintf(reg_dir, "0x%x", reg); - cpu_dentry = debugfs_create_dir(reg_dir, dentry); - err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); - if (err) - return err; - } - } - - return err; -} - -static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned type; - int err = 0; - - for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { - if (!is_typeflag_valid(cpu, cpu_base[type].flag)) - continue; - cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); - per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry; - - if (type < CPU_TSS_BIT) - err = cpu_init_msr(cpu, type, cpu_dentry); - else - err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, - cpu_dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_cpu(void) -{ - struct dentry *cpu_dentry = NULL; - struct cpuinfo_x86 *cpui; - char cpu_dir[12]; - unsigned cpu; - int err = 0; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - cpui = &cpu_data(cpu); - if (!cpu_has(cpui, X86_FEATURE_MSR)) - continue; - - sprintf(cpu_dir, "cpu%d", cpu); - cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); - err = cpu_init_allreg(cpu, cpu_dentry); - - pr_info("cpu%d(%d) debug files %d\n", - cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu)); - if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) { - pr_err("Register files count %d exceeds limit %d\n", - per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES); - per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES; - err = -ENFILE; - } - if (err) - return err; - } - - return err; -} - -static int __init cpu_debug_init(void) -{ - cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); - - return cpu_init_cpu(); -} - -static void __exit cpu_debug_exit(void) -{ - int i, cpu; - - if (cpu_debugfs_dir) - debugfs_remove_recursive(cpu_debugfs_dir); - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++) - kfree(per_cpu(cpud_priv_arr[i], cpu)); -} - -module_init(cpu_debug_init); -module_exit(cpu_debug_exit); - -MODULE_AUTHOR("Jaswinder Singh Rajput"); -MODULE_DESCRIPTION("CPU Debug module"); -MODULE_LICENSE("GPL"); -- cgit v1.2.2 From da482474b8396e1a099c37ffc6541b78775aedb4 Mon Sep 17 00:00:00 2001 From: Russ Anderson <rja@sgi.com> Date: Tue, 26 Jan 2010 20:37:22 -0600 Subject: x86, msr/cpuid: Pass the number of minors when unregistering MSR and CPUID drivers. Pass the number of minors when unregistering MSR and CPUID drivers. Reported-by: Dean Nelson <dnelson@redhat.com> Signed-off-by: Dean Nelson <dnelson@redhat.com> LKML-Reference: <20100127023722.GA22305@sgi.com> Signed-off-by: Russ Anderson <rja@sgi.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpuid.c | 2 +- arch/x86/kernel/msr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index cb27fd6136c9..83e5e628de73 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -229,7 +229,7 @@ static void __exit cpuid_exit(void) for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4bd93c9b2b27..206735ac8cbd 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -285,7 +285,7 @@ static void __exit msr_exit(void) for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); } -- cgit v1.2.2 From aca3bb5910119d4cf6c28568a642582efb4cc14a Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich <sivanich@sgi.com> Date: Fri, 22 Jan 2010 09:41:40 -0600 Subject: x86, UV: Fix RTC latency bug by reading replicated cachelines For SGI UV node controllers (HUB) rev 2.0 or greater, use replicated cachelines to read the RTC timer. This optimization allows faster simulataneous reads from a given socket. Signed-off-by: Dimitri Sivanich <sivanich@sgi.com> Cc: Jack Steiner <steiner@sgi.com> LKML-Reference: <20100122154140.GB4975@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/uv_time.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 3c84aa001c11..2b75ef638dbc 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -282,10 +282,21 @@ static int uv_rtc_unset_timer(int cpu, int force) /* * Read the RTC. + * + * Starting with HUB rev 2.0, the UV RTC register is replicated across all + * cachelines of it's own page. This allows faster simultaneous reads + * from a given socket. */ static cycle_t uv_read_rtc(struct clocksource *cs) { - return (cycle_t)uv_read_local_mmr(UVH_RTC); + unsigned long offset; + + if (uv_get_min_hub_revision_id() == 1) + offset = 0; + else + offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; + + return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); } /* -- cgit v1.2.2 From 35ea63d70f827a26c150993b4b940925bb02b03f Mon Sep 17 00:00:00 2001 From: Leann Ogasawara <leann.ogasawara@canonical.com> Date: Wed, 27 Jan 2010 15:29:18 -0800 Subject: x86: Add Dell OptiPlex 760 reboot quirk Dell OptiPlex 760 hangs on reboot unless reboot=bios is used. Add quirk to reboot through the BIOS. BugLink: https://bugs.launchpad.net/bugs/488319 Signed-off-by: Leann Ogasawara <leann.ogasawara@canonical.com> LKML-Reference: <1264634958.27335.1091.camel@emiko> Cc: <stable@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/reboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1545bc0c9845..704bddcdf64d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", -- cgit v1.2.2 From 439913fffd39374c3737186b22d2d56c3a0ae526 Mon Sep 17 00:00:00 2001 From: Lin Ming <ming.m.lin@intel.com> Date: Thu, 28 Jan 2010 10:53:19 +0800 Subject: ACPI: replace acpi_integer by u64 acpi_integer is now obsolete and removed from the ACPICA code base, replaced by u64. Signed-off-by: Lin Ming <ming.m.lin@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f125e5c551c0..55d42bc443e8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data) static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - acpi_integer control; + u64 control; if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; @@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val = -ENODEV; - acpi_integer control, status; + u64 control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); @@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, u32 fid; u32 vid; u32 freq, index; - acpi_integer status, control; + u64 status, control; if (data->exttype) { status = data->acpi_data.states[i].status; -- cgit v1.2.2 From 339ce1a4dc2ca26444c4f65c31b71a5056f3bb0b Mon Sep 17 00:00:00 2001 From: Anton Blanchard <anton@samba.org> Date: Mon, 18 Jan 2010 16:47:07 +1100 Subject: perf: Fix inconsistency between IP and callchain sampling When running perf across all cpus with backtracing (-a -g), sometimes we get samples without associated backtraces: 23.44% init [kernel] [k] restore 11.46% init eeba0c [k] 0x00000000eeba0c 6.77% swapper [kernel] [k] .perf_ctx_adjust_freq 5.73% init [kernel] [k] .__trace_hcall_entry 4.69% perf libc-2.9.so [.] 0x0000000006bb8c | |--11.11%-- 0xfffa941bbbc It turns out the backtrace code has a check for the idle task and the IP sampling does not. This creates problems when profiling an interrupt heavy workload (in my case 10Gbit ethernet) since we get no backtraces for interrupts received while idle (ie most of the workload). Right now x86 and sh check that current is not NULL, which should never happen so remove that too. Idle task's exclusion must be performed from the core code, on top of perf_event_attr:exclude_idle. Signed-off-by: Anton Blanchard <anton@samba.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Paul Mackerras <paulus@samba.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> LKML-Reference: <20100118054707.GT12666@kryten> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b1bb8c550526..ed1998b28a7c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2425,9 +2425,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) is_user = user_mode(regs); - if (!current || current->pid == 0) - return; - if (is_user && current->state != TASK_RUNNING) return; -- cgit v1.2.2 From 40f9249a73f6c251adea492b1c3d19d39e2a9bda Mon Sep 17 00:00:00 2001 From: "K.Prasad" <prasad@linux.vnet.ibm.com> Date: Thu, 28 Jan 2010 16:44:01 +0530 Subject: x86/debug: Clear reserved bits of DR6 in do_debug() Clear the reserved bits from the stored copy of debug status register (DR6). This will help easy bitwise operations such as quick testing of a debug event origin. Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> Cc: Roland McGrath <roland@redhat.com> Cc: Jan Kiszka <jan.kiszka@siemens.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: Ingo Molnar <mingo@elte.hu> LKML-Reference: <20100128111401.GB13935@in.ibm.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> --- arch/x86/kernel/traps.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 33399176512a..1168e4454188 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) get_debugreg(dr6, 6); + /* Filter out all the reserved bits which are preset to 1 */ + dr6 &= ~DR6_RESERVED; + /* Catch kmemcheck conditions first of all! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; -- cgit v1.2.2 From e0e53db6133c32964fd17f20b17073a402f07ed3 Mon Sep 17 00:00:00 2001 From: "K.Prasad" <prasad@linux.vnet.ibm.com> Date: Thu, 28 Jan 2010 16:44:15 +0530 Subject: x86/hw-breakpoints: Optimize return code from notifier chain in hw_breakpoint_handler Processing of debug exceptions in do_debug() can stop if it originated from a hw-breakpoint exception by returning NOTIFY_STOP in most cases. But for certain cases such as: a) user-space breakpoints with pending SIGTRAP signal delivery (as in the case of ptrace induced breakpoints). b) exceptions due to other causes than breakpoints We will continue to process the exception by returning NOTIFY_DONE. Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Roland McGrath <roland@redhat.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: Jan Kiszka <jan.kiszka@siemens.com> LKML-Reference: <20100128111415.GC13935@in.ibm.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> --- arch/x86/kernel/hw_breakpoint.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 05d5fec64a94..ae90b4739435 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -502,8 +502,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) rcu_read_lock(); bp = per_cpu(bp_per_reg[i], cpu); - if (bp) - rc = NOTIFY_DONE; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -522,7 +520,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) rcu_read_unlock(); } - if (dr6 & (~DR_TRAP_BITS)) + /* + * Further processing in do_debug() is needed for a) user-space + * breakpoints (to generate signals) and b) when the system has + * taken exception due to multiple causes + */ + if ((current->thread.debugreg6 & DR_TRAP_BITS) || + (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; set_debugreg(dr7, 7); -- cgit v1.2.2 From 1da53e023029c067ba1277a33038c65d6e4c99b3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Mon, 18 Jan 2010 10:58:01 +0200 Subject: perf_events, x86: Improve x86 event scheduling This patch improves event scheduling by maximizing the use of PMU registers regardless of the order in which events are created in a group. The algorithm takes into account the list of counter constraints for each event. It assigns events to counters from the most constrained, i.e., works on only one counter, to the least constrained, i.e., works on any counter. Intel Fixed counter events and the BTS special event are also handled via this algorithm which is designed to be fairly generic. The patch also updates the validation of an event to use the scheduling algorithm. This will cause early failure in perf_event_open(). The 2nd version of this patch follows the model used by PPC, by running the scheduling algorithm and the actual assignment separately. Actual assignment takes place in hw_perf_enable() whereas scheduling is implemented in hw_perf_group_sched_in() and x86_pmu_enable(). Signed-off-by: Stephane Eranian <eranian@google.com> [ fixup whitespace and style nits as well as adding is_x86_event() ] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul Mackerras <paulus@samba.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> LKML-Reference: <4b5430c6.0f975e0a.1bf9.ffff85fe@mx.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 775 ++++++++++++++++++++++++++++----------- 1 file changed, 560 insertions(+), 215 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ed1998b28a7c..995ac4ae379c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -7,6 +7,7 @@ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ @@ -68,26 +69,37 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) + +struct event_constraint { + u64 idxmsk[BITS_TO_U64(X86_PMC_IDX_MAX)]; + int code; + int cmask; +}; + struct cpu_hw_events { - struct perf_event *events[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; struct debug_store *ds; -}; -struct event_constraint { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int code; + int n_events; + int n_added; + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } -#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } +#define EVENT_CONSTRAINT(c, n, m) { \ + .code = (c), \ + .cmask = (m), \ + .idxmsk[0] = (n) } -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->idxmsk[0]; (e)++) +#define EVENT_CONSTRAINT_END \ + { .code = 0, .cmask = 0, .idxmsk[0] = 0 } +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -114,8 +126,9 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - int (*get_event_idx)(struct cpu_hw_events *cpuc, - struct hw_perf_event *hwc); + void (*get_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event, u64 *idxmsk); + void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); + const struct event_constraint *event_constraints; }; static struct x86_pmu x86_pmu __read_mostly; @@ -124,7 +137,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static const struct event_constraint *event_constraints; +static int x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx); /* * Not sure about some of these @@ -171,14 +185,14 @@ static u64 p6_pmu_raw_event(u64 hw_event) return hw_event & P6_EVNTSEL_MASK; } -static const struct event_constraint intel_p6_event_constraints[] = +static struct event_constraint intel_p6_event_constraints[] = { - EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0xc1, 0x1, INTEL_ARCH_EVENT_MASK), /* FLOPS */ + EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */ EVENT_CONSTRAINT_END }; @@ -196,32 +210,43 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static const struct event_constraint intel_core_event_constraints[] = -{ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ +static struct event_constraint intel_core_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ + EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ + EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x2, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0x18, 0x1, INTEL_ARCH_EVENT_MASK), /* IDLE_DURING_DIV */ + EVENT_CONSTRAINT(0x19, 0x2, INTEL_ARCH_EVENT_MASK), /* DELAYED_BYPASS */ + EVENT_CONSTRAINT(0xa1, 0x1, INTEL_ARCH_EVENT_MASK), /* RS_UOPS_DISPATCH_CYCLES */ + EVENT_CONSTRAINT(0xcb, 0x1, INTEL_ARCH_EVENT_MASK), /* MEM_LOAD_RETIRED */ EVENT_CONSTRAINT_END }; -static const struct event_constraint intel_nehalem_event_constraints[] = -{ - EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ - EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ +static struct event_constraint intel_nehalem_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ + EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ + EVENT_CONSTRAINT(0x40, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LD */ + EVENT_CONSTRAINT(0x41, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_ST */ + EVENT_CONSTRAINT(0x42, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK */ + EVENT_CONSTRAINT(0x43, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_ALL_REF */ + EVENT_CONSTRAINT(0x4e, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_PREFETCH */ + EVENT_CONSTRAINT(0x4c, 0x3, INTEL_ARCH_EVENT_MASK), /* LOAD_HIT_PRE */ + EVENT_CONSTRAINT(0x51, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D */ + EVENT_CONSTRAINT(0x52, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0x53, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0xc5, 0x3, INTEL_ARCH_EVENT_MASK), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ + EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ EVENT_CONSTRAINT_END }; @@ -527,11 +552,11 @@ static u64 intel_pmu_raw_event(u64 hw_event) #define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_REG_MASK) + (INTEL_ARCH_EVTSEL_MASK | \ + INTEL_ARCH_UNIT_MASK | \ + INTEL_ARCH_EDGE_MASK | \ + INTEL_ARCH_INV_MASK | \ + INTEL_ARCH_CNT_MASK) return hw_event & CORE_EVNTSEL_MASK; } @@ -1120,9 +1145,15 @@ static void amd_pmu_disable_all(void) void hw_perf_disable(void) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + if (!x86_pmu_initialized()) return; - return x86_pmu.disable_all(); + + if (cpuc->enabled) + cpuc->n_added = 0; + + x86_pmu.disable_all(); } static void p6_pmu_enable_all(void) @@ -1189,10 +1220,237 @@ static void amd_pmu_enable_all(void) } } +static const struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) +{ + return event->pmu == &pmu; +} + +static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + int i, j , w, num; + int weight, wmax; + unsigned long *c; + u64 constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + + for (i = 0; i < n; i++) { + x86_pmu.get_event_constraints(cpuc, + cpuc->event_list[i], + constraints[i]); + } + + /* + * weight = number of possible counters + * + * 1 = most constrained, only works on one counter + * wmax = least constrained, works on any counter + * + * assign events to counters starting with most + * constrained events. + */ + wmax = x86_pmu.num_events; + + /* + * when fixed event counters are present, + * wmax is incremented by 1 to account + * for one more choice + */ + if (x86_pmu.num_events_fixed) + wmax++; + + num = n; + for (w = 1; num && w <= wmax; w++) { + /* for each event */ + for (i = 0; i < n; i++) { + c = (unsigned long *)constraints[i]; + hwc = &cpuc->event_list[i]->hw; + + weight = bitmap_weight(c, X86_PMC_IDX_MAX); + if (weight != w) + continue; + + /* + * try to reuse previous assignment + * + * This is possible despite the fact that + * events or events order may have changed. + * + * What matters is the level of constraints + * of an event and this is constant for now. + * + * This is possible also because we always + * scan from most to least constrained. Thus, + * if a counter can be reused, it means no, + * more constrained events, needed it. And + * next events will either compete for it + * (which cannot be solved anyway) or they + * have fewer constraints, and they can use + * another counter. + */ + j = hwc->idx; + if (j != -1 && !test_bit(j, used_mask)) + goto skip; + + for_each_bit(j, c, X86_PMC_IDX_MAX) { + if (!test_bit(j, used_mask)) + break; + } + + if (j == X86_PMC_IDX_MAX) + break; +skip: + set_bit(j, used_mask); + +#if 0 + pr_debug("CPU%d config=0x%llx idx=%d assign=%c\n", + smp_processor_id(), + hwc->config, + j, + assign ? 'y' : 'n'); +#endif + + if (assign) + assign[i] = j; + num--; + } + } + /* + * scheduling failed or is just a simulation, + * free resources if necessary + */ + if (!assign || num) { + for (i = 0; i < n; i++) { + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + } + } + return num ? -ENOSPC : 0; +} + +/* + * dogrp: true if must collect siblings events (group) + * returns total number of events and error code + */ +static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; + + /* current number of events already accepted */ + n = cpuc->n_events; + + if (is_x86_event(leader)) { + if (n >= max_count) + return -ENOSPC; + cpuc->event_list[n] = leader; + n++; + } + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_x86_event(event) || + event->state == PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -ENOSPC; + + cpuc->event_list[n] = event; + n++; + } + return n; +} + + +static inline void x86_assign_hw_event(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + hwc->idx = idx; + + if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + hwc->config_base = 0; + hwc->event_base = 0; + } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that event_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->event_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + } else { + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; + } +} + void hw_perf_enable(void) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + int i; + if (!x86_pmu_initialized()) return; + if (cpuc->n_added) { + /* + * apply assignment obtained either from + * hw_perf_group_sched_in() or x86_pmu_enable() + * + * step1: save events moving to new counters + * step2: reprogram moved events into new counters + */ + for (i = 0; i < cpuc->n_events; i++) { + + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == -1 || hwc->idx == cpuc->assign[i]) + continue; + + x86_pmu.disable(hwc, hwc->idx); + + clear_bit(hwc->idx, cpuc->active_mask); + barrier(); + cpuc->events[hwc->idx] = NULL; + + x86_perf_event_update(event, hwc, hwc->idx); + + hwc->idx = -1; + } + + for (i = 0; i < cpuc->n_events; i++) { + + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == -1) { + x86_assign_hw_event(event, hwc, cpuc->assign[i]); + x86_perf_event_set_period(event, hwc, hwc->idx); + } + /* + * need to mark as active because x86_pmu_disable() + * clear active_mask and eventsp[] yet it preserves + * idx + */ + set_bit(hwc->idx, cpuc->active_mask); + cpuc->events[hwc->idx] = event; + + x86_pmu.enable(hwc, hwc->idx); + perf_event_update_userpage(event); + } + cpuc->n_added = 0; + perf_events_lapic_init(); + } x86_pmu.enable_all(); } @@ -1391,148 +1649,43 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) x86_pmu_enable_event(hwc, idx); } -static int fixed_mode_idx(struct hw_perf_event *hwc) -{ - unsigned int hw_event; - - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; - - if (!x86_pmu.num_events_fixed) - return -1; - - /* - * fixed counters do not take all possible filters - */ - if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) - return -1; - - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; - - return -1; -} - -/* - * generic counter allocator: get next free counter - */ -static int -gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); - return idx == x86_pmu.num_events ? -1 : idx; -} - /* - * intel-specific counter allocator: check event constraints - */ -static int -intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - const struct event_constraint *event_constraint; - int i, code; - - if (!event_constraints) - goto skip; - - code = hwc->config & CORE_EVNTSEL_EVENT_MASK; - - for_each_event_constraint(event_constraint, event_constraints) { - if (code == event_constraint->code) { - for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { - if (!test_and_set_bit(i, cpuc->used_mask)) - return i; - } - return -1; - } - } -skip: - return gen_get_event_idx(cpuc, hwc); -} - -static int -x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = fixed_mode_idx(hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->event_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed event, if that is already taken - * then try to get a generic event: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; - - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic event again */ - if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = x86_pmu.get_event_idx(cpuc, hwc); - if (idx == -1) - return -EAGAIN; - - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; - } - - return idx; -} - -/* - * Find a PMC slot for the freshly enabled / scheduled in event: + * activate a single event + * + * The event is added to the group of enabled events + * but only if it can be scehduled with existing events. + * + * Called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() */ static int x86_pmu_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int idx; + struct hw_perf_event *hwc; + int assign[X86_PMC_IDX_MAX]; + int n, n0, ret; - idx = x86_schedule_event(cpuc, hwc); - if (idx < 0) - return idx; + hwc = &event->hw; - perf_events_lapic_init(); + n0 = cpuc->n_events; + n = collect_events(cpuc, event, false); + if (n < 0) + return n; - x86_pmu.disable(hwc, idx); - - cpuc->events[idx] = event; - set_bit(idx, cpuc->active_mask); + ret = x86_schedule_events(cpuc, n, assign); + if (ret) + return ret; + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); - x86_perf_event_set_period(event, hwc, idx); - x86_pmu.enable(hwc, idx); + cpuc->n_events = n; + cpuc->n_added = n - n0; - perf_event_update_userpage(event); + if (hwc->idx != -1) + x86_perf_event_set_period(event, hwc, hwc->idx); return 0; } @@ -1576,7 +1729,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -1664,7 +1817,7 @@ static void x86_pmu_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; + int i, idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler @@ -1690,8 +1843,19 @@ static void x86_pmu_disable(struct perf_event *event) intel_pmu_drain_bts_buffer(cpuc); cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); + for (i = 0; i < cpuc->n_events; i++) { + if (event == cpuc->event_list[i]) { + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); + + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; + + --cpuc->n_events; + } + } perf_event_update_userpage(event); } @@ -1962,6 +2126,176 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } +static struct event_constraint bts_constraint = { + .code = 0, + .cmask = 0, + .idxmsk[0] = 1ULL << X86_PMC_IDX_FIXED_BTS +}; + +static int intel_special_constraints(struct perf_event *event, + u64 *idxmsk) +{ + unsigned int hw_event; + + hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (event->hw.sample_period == 1))) { + + bitmap_copy((unsigned long *)idxmsk, + (unsigned long *)bts_constraint.idxmsk, + X86_PMC_IDX_MAX); + return 1; + } + return 0; +} + +static void intel_get_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + u64 *idxmsk) +{ + const struct event_constraint *c; + + /* + * cleanup bitmask + */ + bitmap_zero((unsigned long *)idxmsk, X86_PMC_IDX_MAX); + + if (intel_special_constraints(event, idxmsk)) + return; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + + bitmap_copy((unsigned long *)idxmsk, + (unsigned long *)c->idxmsk, + X86_PMC_IDX_MAX); + return; + } + } + } + /* no constraints, means supports all generic counters */ + bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events); +} + +static void amd_get_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + u64 *idxmsk) +{ +} + +static int x86_event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, int cpu) +{ + int ret = 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; + event->tstamp_running += event->ctx->time - event->tstamp_stopped; + + if (!is_x86_event(event)) + ret = event->pmu->enable(event); + + if (!ret && !is_software_event(event)) + cpuctx->active_oncpu++; + + if (!ret && event->attr.exclusive) + cpuctx->exclusive = 1; + + return ret; +} + +static void x86_event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, int cpu) +{ + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + + if (!is_x86_event(event)) + event->pmu->disable(event); + + event->tstamp_running -= event->ctx->time - event->tstamp_stopped; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +/* + * Called to enable a whole group of events. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + * + * called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() + */ +int hw_perf_group_sched_in(struct perf_event *leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct perf_event *sub; + int assign[X86_PMC_IDX_MAX]; + int n0, n1, ret; + + /* n0 = total number of events */ + n0 = collect_events(cpuc, leader, true); + if (n0 < 0) + return n0; + + ret = x86_schedule_events(cpuc, n0, assign); + if (ret) + return ret; + + ret = x86_event_sched_in(leader, cpuctx, cpu); + if (ret) + return ret; + + n1 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state != PERF_EVENT_STATE_OFF) { + ret = x86_event_sched_in(sub, cpuctx, cpu); + if (ret) + goto undo; + ++n1; + } + } + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n0*sizeof(int)); + + cpuc->n_events = n0; + cpuc->n_added = n1; + ctx->nr_active += n1; + + /* + * 1 means successful and events are active + * This is not quite true because we defer + * actual activation until hw_perf_enable() but + * this way we* ensure caller won't try to enable + * individual events + */ + return 1; +undo: + x86_event_sched_out(leader, cpuctx, cpu); + n0 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + x86_event_sched_out(sub, cpuctx, cpu); + if (++n0 == n1) + break; + } + } + return ret; +} + static __read_mostly struct notifier_block perf_event_nmi_notifier = { .notifier_call = perf_event_nmi_handler, .next = NULL, @@ -1993,7 +2327,8 @@ static __initconst struct x86_pmu p6_pmu = { */ .event_bits = 32, .event_mask = (1ULL << 32) - 1, - .get_event_idx = intel_get_event_idx, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_p6_event_constraints }; static __initconst struct x86_pmu intel_pmu = { @@ -2017,7 +2352,7 @@ static __initconst struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, - .get_event_idx = intel_get_event_idx, + .get_event_constraints = intel_get_event_constraints }; static __initconst struct x86_pmu amd_pmu = { @@ -2038,7 +2373,7 @@ static __initconst struct x86_pmu amd_pmu = { .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_idx = gen_get_event_idx, + .get_event_constraints = amd_get_event_constraints }; static __init int p6_pmu_init(void) @@ -2051,12 +2386,9 @@ static __init int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ - event_constraints = intel_p6_event_constraints; - break; case 9: case 13: /* Pentium M */ - event_constraints = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -2121,23 +2453,29 @@ static __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + x86_pmu.event_constraints = intel_core_event_constraints; pr_cont("Core2 events, "); - event_constraints = intel_core_event_constraints; break; - default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - event_constraints = intel_nehalem_event_constraints; + x86_pmu.event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("Atom events, "); break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); } return 0; } @@ -2234,36 +2572,43 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; -static int -validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct hw_perf_event fake_event = event->hw; - - if (event->pmu && event->pmu != &pmu) - return 0; - - return x86_schedule_event(cpuc, &fake_event) >= 0; -} - +/* + * validate a single event group + * + * validation include: + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters + * + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ static int validate_group(struct perf_event *event) { - struct perf_event *sibling, *leader = event->group_leader; - struct cpu_hw_events fake_pmu; + struct perf_event *leader = event->group_leader; + struct cpu_hw_events fake_cpuc; + int n; - memset(&fake_pmu, 0, sizeof(fake_pmu)); + memset(&fake_cpuc, 0, sizeof(fake_cpuc)); - if (!validate_event(&fake_pmu, leader)) + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = collect_events(&fake_cpuc, leader, true); + if (n < 0) return -ENOSPC; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { - if (!validate_event(&fake_pmu, sibling)) - return -ENOSPC; - } - - if (!validate_event(&fake_pmu, event)) + fake_cpuc.n_events = n; + n = collect_events(&fake_cpuc, event, false); + if (n < 0) return -ENOSPC; - return 0; + fake_cpuc.n_events = n; + + return x86_schedule_events(&fake_cpuc, n, NULL); } const struct pmu *hw_perf_event_init(struct perf_event *event) -- cgit v1.2.2 From 8113070d6639d2245c6c79afb8df42cedab30540 Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Thu, 21 Jan 2010 17:39:01 +0200 Subject: perf_events: Add fast-path to the rescheduling code Implement correct fastpath scheduling, i.e., reuse previous assignment. Signed-off-by: Stephane Eranian <eranian@google.com> [ split from larger patch] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <4b588464.1818d00a.4456.383b@mx.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 91 +++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 30 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 995ac4ae379c..0bd23d01af34 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1244,6 +1244,46 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) constraints[i]); } + /* + * fastpath, try to reuse previous register + */ + for (i = 0, num = n; i < n; i++, num--) { + hwc = &cpuc->event_list[i]->hw; + c = (unsigned long *)constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + +#if 0 + pr_debug("CPU%d fast config=0x%llx idx=%d assign=%c\n", + smp_processor_id(), + hwc->config, + hwc->idx, + assign ? 'y' : 'n'); +#endif + + set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + if (!num) + goto done; + + /* + * begin slow path + */ + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + /* * weight = number of possible counters * @@ -1263,10 +1303,9 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (x86_pmu.num_events_fixed) wmax++; - num = n; - for (w = 1; num && w <= wmax; w++) { + for (w = 1, num = n; num && w <= wmax; w++) { /* for each event */ - for (i = 0; i < n; i++) { + for (i = 0; num && i < n; i++) { c = (unsigned long *)constraints[i]; hwc = &cpuc->event_list[i]->hw; @@ -1274,28 +1313,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (weight != w) continue; - /* - * try to reuse previous assignment - * - * This is possible despite the fact that - * events or events order may have changed. - * - * What matters is the level of constraints - * of an event and this is constant for now. - * - * This is possible also because we always - * scan from most to least constrained. Thus, - * if a counter can be reused, it means no, - * more constrained events, needed it. And - * next events will either compete for it - * (which cannot be solved anyway) or they - * have fewer constraints, and they can use - * another counter. - */ - j = hwc->idx; - if (j != -1 && !test_bit(j, used_mask)) - goto skip; - for_each_bit(j, c, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; @@ -1303,22 +1320,23 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; -skip: - set_bit(j, used_mask); #if 0 - pr_debug("CPU%d config=0x%llx idx=%d assign=%c\n", + pr_debug("CPU%d slow config=0x%llx idx=%d assign=%c\n", smp_processor_id(), hwc->config, j, assign ? 'y' : 'n'); #endif + set_bit(j, used_mask); + if (assign) assign[i] = j; num--; } } +done: /* * scheduling failed or is just a simulation, * free resources if necessary @@ -1357,7 +1375,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, list_for_each_entry(event, &leader->sibling_list, group_entry) { if (!is_x86_event(event) || - event->state == PERF_EVENT_STATE_OFF) + event->state <= PERF_EVENT_STATE_OFF) continue; if (n >= max_count) @@ -2184,6 +2202,8 @@ static void amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, u64 *idxmsk) { + /* no constraints, means supports all generic counters */ + bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events); } static int x86_event_sched_in(struct perf_event *event, @@ -2258,7 +2278,7 @@ int hw_perf_group_sched_in(struct perf_event *leader, n1 = 1; list_for_each_entry(sub, &leader->sibling_list, group_entry) { - if (sub->state != PERF_EVENT_STATE_OFF) { + if (sub->state > PERF_EVENT_STATE_OFF) { ret = x86_event_sched_in(sub, cpuctx, cpu); if (ret) goto undo; @@ -2613,12 +2633,23 @@ static int validate_group(struct perf_event *event) const struct pmu *hw_perf_event_init(struct perf_event *event) { + const struct pmu *tmp; int err; err = __hw_perf_event_init(event); if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + if (event->group_leader != event) err = validate_group(event); + + event->pmu = tmp; } if (err) { if (event->destroy) -- cgit v1.2.2 From 502568d563bcc37ac505a83341c0c95b88c015a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 22 Jan 2010 14:35:46 +0100 Subject: perf_event: x86: Allocate the fake_cpuc GCC was complaining the stack usage was too large, so allocate the structure. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100122155535.411197266@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0bd23d01af34..7bd359a57839 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2606,10 +2606,13 @@ static const struct pmu pmu = { static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; - struct cpu_hw_events fake_cpuc; - int n; + struct cpu_hw_events *fake_cpuc; + int ret, n; - memset(&fake_cpuc, 0, sizeof(fake_cpuc)); + ret = -ENOMEM; + fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); + if (!fake_cpuc) + goto out; /* * the event is not yet connected with its @@ -2617,18 +2620,24 @@ static int validate_group(struct perf_event *event) * existing siblings, then add the new event * before we can simulate the scheduling */ - n = collect_events(&fake_cpuc, leader, true); + ret = -ENOSPC; + n = collect_events(fake_cpuc, leader, true); if (n < 0) - return -ENOSPC; + goto out_free; - fake_cpuc.n_events = n; - n = collect_events(&fake_cpuc, event, false); + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); if (n < 0) - return -ENOSPC; + goto out_free; - fake_cpuc.n_events = n; + fake_cpuc->n_events = n; - return x86_schedule_events(&fake_cpuc, n, NULL); + ret = x86_schedule_events(fake_cpuc, n, NULL); + +out_free: + kfree(fake_cpuc); +out: + return ret; } const struct pmu *hw_perf_event_init(struct perf_event *event) -- cgit v1.2.2 From 81269a085669b5130058a0275aa7ba9f94abd1fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 22 Jan 2010 14:55:22 +0100 Subject: perf_event: x86: Fixup constraints typing issue Constraints gets defined an u64 but in long quantities and then cast to long. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100122155535.504916780@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 7bd359a57839..7e181a5097ea 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1232,7 +1232,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) int i, j , w, num; int weight, wmax; unsigned long *c; - u64 constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct hw_perf_event *hwc; @@ -1249,7 +1249,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0, num = n; i < n; i++, num--) { hwc = &cpuc->event_list[i]->hw; - c = (unsigned long *)constraints[i]; + c = constraints[i]; /* never assigned */ if (hwc->idx == -1) @@ -1306,7 +1306,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) for (w = 1, num = n; num && w <= wmax; w++) { /* for each event */ for (i = 0; num && i < n; i++) { - c = (unsigned long *)constraints[i]; + c = constraints[i]; hwc = &cpuc->event_list[i]->hw; weight = bitmap_weight(c, X86_PMC_IDX_MAX); -- cgit v1.2.2 From c91e0f5da81c6f3a611a1bd6d0cca6717c90fdab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 22 Jan 2010 15:25:59 +0100 Subject: perf_event: x86: Clean up some of the u64/long bitmask casting We need this to be u64 for direct assigment, but the bitmask functions all work on unsigned long, leading to cast heaven, solve this by using a union. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100122155535.595961269@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 7e181a5097ea..921bbf732e77 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -69,10 +69,11 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; -#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) - struct event_constraint { - u64 idxmsk[BITS_TO_U64(X86_PMC_IDX_MAX)]; + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64[1]; + }; int code; int cmask; }; @@ -90,13 +91,14 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ - .code = (c), \ - .cmask = (m), \ - .idxmsk[0] = (n) } +#define EVENT_CONSTRAINT(c, n, m) { \ + { .idxmsk64[0] = (n) }, \ + .code = (c), \ + .cmask = (m), \ +} #define EVENT_CONSTRAINT_END \ - { .code = 0, .cmask = 0, .idxmsk[0] = 0 } + EVENT_CONSTRAINT(0, 0, 0) #define for_each_event_constraint(e, c) \ for ((e) = (c); (e)->cmask; (e)++) @@ -126,8 +128,11 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - void (*get_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event, u64 *idxmsk); - void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); + void (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event, + unsigned long *idxmsk); + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); const struct event_constraint *event_constraints; }; @@ -2144,14 +2149,11 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } -static struct event_constraint bts_constraint = { - .code = 0, - .cmask = 0, - .idxmsk[0] = 1ULL << X86_PMC_IDX_FIXED_BTS -}; +static struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); static int intel_special_constraints(struct perf_event *event, - u64 *idxmsk) + unsigned long *idxmsk) { unsigned int hw_event; @@ -2171,14 +2173,14 @@ static int intel_special_constraints(struct perf_event *event, static void intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - u64 *idxmsk) + unsigned long *idxmsk) { const struct event_constraint *c; /* * cleanup bitmask */ - bitmap_zero((unsigned long *)idxmsk, X86_PMC_IDX_MAX); + bitmap_zero(idxmsk, X86_PMC_IDX_MAX); if (intel_special_constraints(event, idxmsk)) return; @@ -2186,10 +2188,7 @@ static void intel_get_event_constraints(struct cpu_hw_events *cpuc, if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { if ((event->hw.config & c->cmask) == c->code) { - - bitmap_copy((unsigned long *)idxmsk, - (unsigned long *)c->idxmsk, - X86_PMC_IDX_MAX); + bitmap_copy(idxmsk, c->idxmsk, X86_PMC_IDX_MAX); return; } } @@ -2200,10 +2199,10 @@ static void intel_get_event_constraints(struct cpu_hw_events *cpuc, static void amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - u64 *idxmsk) + unsigned long *idxmsk) { /* no constraints, means supports all generic counters */ - bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events); + bitmap_fill(idxmsk, x86_pmu.num_events); } static int x86_event_sched_in(struct perf_event *event, -- cgit v1.2.2 From 8433be1184e4f22c37d4b8ed36cde529a47882f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 22 Jan 2010 15:38:26 +0100 Subject: perf_event: x86: Reduce some overly long lines with some MACROs Introduce INTEL_EVENT_CONSTRAINT and FIXED_EVENT_CONSTRAINT to reduce some line length and typing work. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100122155535.688730371@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 68 ++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 921bbf732e77..4d1ed101c10d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -97,6 +97,12 @@ struct cpu_hw_events { .cmask = (m), \ } +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) + #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) @@ -192,12 +198,12 @@ static u64 p6_pmu_raw_event(u64 hw_event) static struct event_constraint intel_p6_event_constraints[] = { - EVENT_CONSTRAINT(0xc1, 0x1, INTEL_ARCH_EVENT_MASK), /* FLOPS */ - EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ EVENT_CONSTRAINT_END }; @@ -217,41 +223,41 @@ static const u64 intel_perfmon_event_map[] = static struct event_constraint intel_core_event_constraints[] = { - EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ - EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ - EVENT_CONSTRAINT(0x10, 0x1, INTEL_ARCH_EVENT_MASK), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x2, INTEL_ARCH_EVENT_MASK), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2, INTEL_ARCH_EVENT_MASK), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2, INTEL_ARCH_EVENT_MASK), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1, INTEL_ARCH_EVENT_MASK), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT(0x18, 0x1, INTEL_ARCH_EVENT_MASK), /* IDLE_DURING_DIV */ - EVENT_CONSTRAINT(0x19, 0x2, INTEL_ARCH_EVENT_MASK), /* DELAYED_BYPASS */ - EVENT_CONSTRAINT(0xa1, 0x1, INTEL_ARCH_EVENT_MASK), /* RS_UOPS_DISPATCH_CYCLES */ - EVENT_CONSTRAINT(0xcb, 0x1, INTEL_ARCH_EVENT_MASK), /* MEM_LOAD_RETIRED */ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_nehalem_event_constraints[] = { - EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ - EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ - EVENT_CONSTRAINT(0x40, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LD */ - EVENT_CONSTRAINT(0x41, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_ST */ - EVENT_CONSTRAINT(0x42, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK */ - EVENT_CONSTRAINT(0x43, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_ALL_REF */ - EVENT_CONSTRAINT(0x4e, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_PREFETCH */ - EVENT_CONSTRAINT(0x4c, 0x3, INTEL_ARCH_EVENT_MASK), /* LOAD_HIT_PRE */ - EVENT_CONSTRAINT(0x51, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D */ - EVENT_CONSTRAINT(0x52, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0x53, 0x3, INTEL_ARCH_EVENT_MASK), /* L1D_CACHE_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0xc5, 0x3, INTEL_ARCH_EVENT_MASK), /* CACHE_LOCK_CYCLES */ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + INTEL_EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ + INTEL_EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_gen_event_constraints[] = { - EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), INTEL_ARCH_FIXED_MASK), /* INSTRUCTIONS_RETIRED */ - EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), INTEL_ARCH_FIXED_MASK), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ EVENT_CONSTRAINT_END }; -- cgit v1.2.2 From 63b146490befc027a7e0923e333269e68b20d380 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 22 Jan 2010 16:32:17 +0100 Subject: perf_event: x86: Optimize the constraint searching bits Instead of copying bitmasks around, pass pointers to the constraint structure. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100122155535.887853503@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 75 ++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4d1ed101c10d..092ad566734c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -134,12 +134,14 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - void (*get_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event, - unsigned long *idxmsk); + + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - const struct event_constraint *event_constraints; + struct event_constraint *event_constraints; }; static struct x86_pmu x86_pmu __read_mostly; @@ -1242,17 +1244,15 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { int i, j , w, num; int weight, wmax; - unsigned long *c; - unsigned long constraints[X86_PMC_IDX_MAX][BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); for (i = 0; i < n; i++) { - x86_pmu.get_event_constraints(cpuc, - cpuc->event_list[i], - constraints[i]); + constraints[i] = + x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); } /* @@ -1267,7 +1267,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) break; /* constraint still honored */ - if (!test_bit(hwc->idx, c)) + if (!test_bit(hwc->idx, c->idxmsk)) break; /* not already used */ @@ -1320,11 +1320,11 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) c = constraints[i]; hwc = &cpuc->event_list[i]->hw; - weight = bitmap_weight(c, X86_PMC_IDX_MAX); + weight = bitmap_weight(c->idxmsk, X86_PMC_IDX_MAX); if (weight != w) continue; - for_each_bit(j, c, X86_PMC_IDX_MAX) { + for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; } @@ -2155,11 +2155,13 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } +static struct event_constraint unconstrained; + static struct event_constraint bts_constraint = EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); -static int intel_special_constraints(struct perf_event *event, - unsigned long *idxmsk) +static struct event_constraint * +intel_special_constraints(struct perf_event *event) { unsigned int hw_event; @@ -2169,46 +2171,34 @@ static int intel_special_constraints(struct perf_event *event, x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && (event->hw.sample_period == 1))) { - bitmap_copy((unsigned long *)idxmsk, - (unsigned long *)bts_constraint.idxmsk, - X86_PMC_IDX_MAX); - return 1; + return &bts_constraint; } - return 0; + return NULL; } -static void intel_get_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event, - unsigned long *idxmsk) +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - const struct event_constraint *c; + struct event_constraint *c; - /* - * cleanup bitmask - */ - bitmap_zero(idxmsk, X86_PMC_IDX_MAX); - - if (intel_special_constraints(event, idxmsk)) - return; + c = intel_special_constraints(event); + if (c) + return c; if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { - bitmap_copy(idxmsk, c->idxmsk, X86_PMC_IDX_MAX); - return; - } + if ((event->hw.config & c->cmask) == c->code) + return c; } } - /* no constraints, means supports all generic counters */ - bitmap_fill((unsigned long *)idxmsk, x86_pmu.num_events); + + return &unconstrained; } -static void amd_get_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event, - unsigned long *idxmsk) +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - /* no constraints, means supports all generic counters */ - bitmap_fill(idxmsk, x86_pmu.num_events); + return &unconstrained; } static int x86_event_sched_in(struct perf_event *event, @@ -2576,6 +2566,9 @@ void __init init_hw_perf_events(void) perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); + unconstrained = (struct event_constraint) + EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0); + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); -- cgit v1.2.2 From 272d30be622c9c6cbd514b1211ff359292001baa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 22 Jan 2010 16:32:17 +0100 Subject: perf_event: x86: Optimize constraint weight computation Add a weight member to the constraint structure and avoid recomputing the weight at runtime. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100122155535.963944926@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 092ad566734c..2c22ce4fa784 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -23,6 +23,7 @@ #include <linux/uaccess.h> #include <linux/highmem.h> #include <linux/cpu.h> +#include <linux/bitops.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -76,6 +77,7 @@ struct event_constraint { }; int code; int cmask; + int weight; }; struct cpu_hw_events { @@ -95,6 +97,7 @@ struct cpu_hw_events { { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ + .weight = HWEIGHT64((u64)(n)), \ } #define INTEL_EVENT_CONSTRAINT(c, n) \ @@ -1242,8 +1245,7 @@ static inline int is_x86_event(struct perf_event *event) static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - int i, j , w, num; - int weight, wmax; + int i, j, w, num, wmax; struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct hw_perf_event *hwc; @@ -1320,8 +1322,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) c = constraints[i]; hwc = &cpuc->event_list[i]->hw; - weight = bitmap_weight(c->idxmsk, X86_PMC_IDX_MAX); - if (weight != w) + if (c->weight != w) continue; for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { -- cgit v1.2.2 From c933c1a603d5bf700ddce79216c1be0ec3bc0e6c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 22 Jan 2010 16:40:12 +0100 Subject: perf_event: x86: Optimize the fast path a little more Remove num from the fast path and save a few ops. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100122155536.056430539@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2c22ce4fa784..33c889ff21ae 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1245,9 +1245,9 @@ static inline int is_x86_event(struct perf_event *event) static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - int i, j, w, num, wmax; struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int i, j, w, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); @@ -1260,7 +1260,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* * fastpath, try to reuse previous register */ - for (i = 0, num = n; i < n; i++, num--) { + for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; c = constraints[i]; @@ -1288,7 +1288,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (assign) assign[i] = hwc->idx; } - if (!num) + if (i == n) goto done; /* -- cgit v1.2.2 From 6c9687abeb24d5b7aae7db5be070c2139ad29e29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Mon, 25 Jan 2010 11:57:25 +0100 Subject: perf_event: x86: Optimize x86_pmu_disable() x86_pmu_disable() removes the event from the cpuc->event_list[], however since an event can only be on that list once, stop looking after we found it. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 33c889ff21ae..66de282ad2fb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1884,6 +1884,7 @@ static void x86_pmu_disable(struct perf_event *event) cpuc->event_list[i-1] = cpuc->event_list[i]; --cpuc->n_events; + break; } } perf_event_update_userpage(event); -- cgit v1.2.2 From 184f412c3341cd24fbd26604634a5800b83dbdc3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 27 Jan 2010 08:39:39 +0100 Subject: perf, x86: Clean up event constraints code a bit - Remove stray debug code - Improve ugly macros a bit - Remove some whitespace damage - (Also fix up some accumulated damage in perf_event.h) Signed-off-by: Ingo Molnar <mingo@elte.hu> Cc: Stephane Eranian <eranian@google.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <new-submission> --- arch/x86/kernel/cpu/perf_event.c | 37 ++++++++----------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66de282ad2fb..fdbe24842271 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,24 +93,19 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ +#define EVENT_CONSTRAINT(c, n, m) { \ { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = HWEIGHT64((u64)(n)), \ } -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define INTEL_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define FIXED_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) +#define EVENT_CONSTRAINT_END EVENT_CONSTRAINT(0, 0, 0) -#define EVENT_CONSTRAINT_END \ - EVENT_CONSTRAINT(0, 0, 0) - -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->cmask; (e)++) +#define for_each_event_constraint(e, c) for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -1276,14 +1271,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (test_bit(hwc->idx, used_mask)) break; -#if 0 - pr_debug("CPU%d fast config=0x%llx idx=%d assign=%c\n", - smp_processor_id(), - hwc->config, - hwc->idx, - assign ? 'y' : 'n'); -#endif - set_bit(hwc->idx, used_mask); if (assign) assign[i] = hwc->idx; @@ -1333,14 +1320,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; -#if 0 - pr_debug("CPU%d slow config=0x%llx idx=%d assign=%c\n", - smp_processor_id(), - hwc->config, - j, - assign ? 'y' : 'n'); -#endif - set_bit(j, used_mask); if (assign) @@ -2596,9 +2575,9 @@ static const struct pmu pmu = { * validate a single event group * * validation include: - * - check events are compatible which each other - * - events do not compete for the same counter - * - number of events <= number of counters + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. -- cgit v1.2.2 From 2e8418736dff9c6fdadb2f87dcc2087cebf32167 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Mon, 25 Jan 2010 15:58:43 +0100 Subject: perf_event: x86: Deduplicate the disable code Share the meat of the x86_pmu_disable() code with hw_perf_enable(). Also remove the barrier() from that code, since I could not convince myself we actually need it. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fdbe24842271..07fa0c2faa09 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1401,6 +1401,8 @@ static inline void x86_assign_hw_event(struct perf_event *event, } } +static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc); + void hw_perf_enable(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1426,13 +1428,7 @@ void hw_perf_enable(void) if (hwc->idx == -1 || hwc->idx == cpuc->assign[i]) continue; - x86_pmu.disable(hwc, hwc->idx); - - clear_bit(hwc->idx, cpuc->active_mask); - barrier(); - cpuc->events[hwc->idx] = NULL; - - x86_perf_event_update(event, hwc, hwc->idx); + __x86_pmu_disable(event, cpuc); hwc->idx = -1; } @@ -1822,11 +1818,10 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) event->pending_kill = POLL_IN; } -static void x86_pmu_disable(struct perf_event *event) +static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int i, idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler @@ -1835,12 +1830,6 @@ static void x86_pmu_disable(struct perf_event *event) clear_bit(idx, cpuc->active_mask); x86_pmu.disable(hwc, idx); - /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the event: - */ - barrier(); - /* * Drain the remaining delta count out of a event * that we are disabling: @@ -1852,6 +1841,14 @@ static void x86_pmu_disable(struct perf_event *event) intel_pmu_drain_bts_buffer(cpuc); cpuc->events[idx] = NULL; +} + +static void x86_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; + + __x86_pmu_disable(event, cpuc); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { -- cgit v1.2.2 From ed8777fc132e589d48a0ba854fdbb5d8203b58e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Wed, 27 Jan 2010 23:07:46 +0100 Subject: perf_events, x86: Fix event constraint masks Since constraints are specified on the event number, not number and unit mask shorten the constraint masks so that we'll actually match something. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100127221121.967610372@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 07fa0c2faa09..951213a51489 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -100,12 +100,17 @@ struct cpu_hw_events { .weight = HWEIGHT64((u64)(n)), \ } -#define INTEL_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) -#define FIXED_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) -#define EVENT_CONSTRAINT_END EVENT_CONSTRAINT(0, 0, 0) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) -#define for_each_event_constraint(e, c) for ((e) = (c); (e)->cmask; (e)++) +#define EVENT_CONSTRAINT_END \ + EVENT_CONSTRAINT(0, 0, 0) + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu -- cgit v1.2.2 From 1a6e21f791fe85b40a9ddbafe999ab8ccffc3f78 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Wed, 27 Jan 2010 23:07:47 +0100 Subject: perf_events, x86: Clean up hw_perf_*_all() implementation Put the recursion avoidance code in the generic hook instead of replicating it in each implementation. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100127221122.057507285@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 59 ++++++++++------------------------------ 1 file changed, 14 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 951213a51489..cf10839f20ea 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1099,15 +1099,8 @@ static int __hw_perf_event_init(struct perf_event *event) static void p6_pmu_disable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -1118,12 +1111,6 @@ static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) @@ -1135,17 +1122,6 @@ static void amd_pmu_disable_all(void) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - /* - * ensure we write the disable before we start disabling the - * events proper, so that amd_pmu_enable_event() does the - * right thing. - */ - barrier(); - for (idx = 0; idx < x86_pmu.num_events; idx++) { u64 val; @@ -1166,23 +1142,20 @@ void hw_perf_disable(void) if (!x86_pmu_initialized()) return; - if (cpuc->enabled) - cpuc->n_added = 0; + if (!cpuc->enabled) + return; + + cpuc->n_added = 0; + cpuc->enabled = 0; + barrier(); x86_pmu.disable_all(); } static void p6_pmu_enable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long val; - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -1193,12 +1166,6 @@ static void intel_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { @@ -1217,12 +1184,6 @@ static void amd_pmu_enable_all(void) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - for (idx = 0; idx < x86_pmu.num_events; idx++) { struct perf_event *event = cpuc->events[idx]; u64 val; @@ -1417,6 +1378,10 @@ void hw_perf_enable(void) if (!x86_pmu_initialized()) return; + + if (cpuc->enabled) + return; + if (cpuc->n_added) { /* * apply assignment obtained either from @@ -1461,6 +1426,10 @@ void hw_perf_enable(void) cpuc->n_added = 0; perf_events_lapic_init(); } + + cpuc->enabled = 1; + barrier(); + x86_pmu.enable_all(); } -- cgit v1.2.2 From 452a339a976e7f782c786eb3f73080401e2fa3a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Wed, 27 Jan 2010 23:07:48 +0100 Subject: perf_events, x86: Implement Intel Westmere support The new Intel documentation includes Westmere arch specific event maps that are significantly different from the Nehalem ones. Add support for this generation. Found the CPUID model numbers on wikipedia. Also ammend some Nehalem constraints, spotted those when looking for the differences between Nehalem and Westmere. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100127221122.151865645@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 124 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 117 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cf10839f20ea..3fac0bfc2dee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -244,18 +244,26 @@ static struct event_constraint intel_core_event_constraints[] = static struct event_constraint intel_nehalem_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - INTEL_EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - INTEL_EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ - INTEL_EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ EVENT_CONSTRAINT_END }; @@ -286,6 +294,97 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +static __initconst u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static __initconst u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -2423,7 +2522,9 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_core_event_constraints; pr_cont("Core2 events, "); break; - case 26: + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -2437,6 +2538,15 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("Atom events, "); break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + pr_cont("Westmere events, "); + break; default: /* * default constraints for v2 and up -- cgit v1.2.2 From 18c01f8abff51e4910cc5ffb4b710e8c6eea60c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Wed, 27 Jan 2010 23:07:49 +0100 Subject: perf_events, x86: Remove spurious counter reset from x86_pmu_enable() At enable time the counter might still have a ->idx pointing to a previously occupied location that might now be taken by another event. Resetting the counter at that location with data from this event will destroy the other counter's count. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <20100127221122.261477183@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3fac0bfc2dee..518eb3e39577 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1762,9 +1762,6 @@ static int x86_pmu_enable(struct perf_event *event) cpuc->n_events = n; cpuc->n_added = n - n0; - if (hwc->idx != -1) - x86_perf_event_set_period(event, hwc, hwc->idx); - return 0; } -- cgit v1.2.2 From 05d43ed8a89c159ff641d472f970e3f1baa66318 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Thu, 28 Jan 2010 22:14:43 -0800 Subject: x86: get rid of the insane TIF_ABI_PENDING bit Now that the previous commit made it possible to do the personality setting at the point of no return, we do just that for ELF binaries. And suddenly all the reasons for that insane TIF_ABI_PENDING bit go away, and we can just make SET_PERSONALITY() just do the obvious thing for a 32-bit compat process. Everything becomes much more straightforward this way. Signed-off-by: H. Peter Anvin <hpa@zytor.com> Cc: stable@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/process.c | 12 ------------ arch/x86/kernel/process_64.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 02c3ee013ccd..c9b3522b6b46 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -115,18 +115,6 @@ void flush_thread(void) { struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } -#endif - flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f9e033150cdf..41a26a82470a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -521,6 +521,17 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + unsigned long get_wchan(struct task_struct *p) { unsigned long stack; -- cgit v1.2.2 From 69c89efb51510b3dc0fa336f7fa257c6e1799ee4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Fri, 29 Jan 2010 11:42:20 -0800 Subject: x86, irq: Update the vector domain for legacy irqs handled by io-apic In the recent change of not reserving IRQ0_VECTOR..IRQ15_VECTOR's on all cpu's, we start with irq 0..15 getting directed to (and handled on) cpu-0. In the logical flat mode, once the AP's are online (and before irqbalance comes into picture), kernel intends to handle these IRQ's on any cpu (as the logical flat mode allows to specify multiple cpu's for the irq destination and the chipset based routing can deliver to the interrupt to any one of the specified cpu's). This was broken with our recent change, which was ending up using only cpu 0 as the destination, even when the kernel was specifying to use all online cpu's for the logical flat mode case. Fix this by updating vector allocation domain (cfg->domain) for legacy irqs, when the IO-APIC handles them. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <20100129194330.207790269@sbs-t61.sc.intel.com> Tested-by: Li Zefan <lizf@cn.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1a30587a6bc2..2430b31c9857 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1428,6 +1428,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq cfg = desc->chip_data; + /* + * For legacy irqs, cfg->domain starts with cpu 0 for legacy + * controllers like 8259. Now that IO-APIC can handle this irq, update + * the cfg->domain. + */ + if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + apic->vector_allocation_domain(0, cfg->domain); + if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.2.2 From 9d133e5db993d577bd868b54083869fe5479fcff Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Fri, 29 Jan 2010 11:42:21 -0800 Subject: x86, irq: Move __setup_vector_irq() before the first irq enable in cpu online path Lowest priority delivery of logical flat mode is broken on some systems, such that even when IO-APIC RTE says deliver the interrupt to a particular CPU, interrupt subsystem delivers the interrupt to totally different CPU. For example, this behavior was observed on a P4 based system with SiS chipset which was reported by Li Zefan. We have been handling this kind of behavior by making sure that in logical flat mode, we assign the same vector to irq mappings on all the 8 possible logical cpu's. But we have been doing this initial assignment (__setup_vector_irq()) a little late (before which interrupts were already enabled for a short duration). Move the __setup_vector_irq() before the first irq enable point in the cpu online path to avoid the issue of not handling some interrupts that wrongly hit the cpu which is still coming online. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <20100129194330.283696385@sbs-t61.sc.intel.com> Tested-by: Li Zefan <lizf@cn.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 8 +++++++- arch/x86/kernel/smpboot.c | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2430b31c9857..937150e4c06d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1256,11 +1256,16 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; struct irq_desc *desc; + /* + * vector_lock will make sure that we don't run into irq vector + * assignments that might be happening on another cpu in parallel, + * while we setup our initial vector to irq mappings. + */ + spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; @@ -1279,6 +1284,7 @@ void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } + spin_unlock(&vector_lock); } static struct irq_chip ioapic_chip; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..b2ebcba729d9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -241,6 +241,11 @@ static void __cpuinit smp_callin(void) map_cpu_to_logical_apicid(); notify_cpu_starting(cpuid); + + /* + * Need to setup vector mappings before we enable interrupts. + */ + __setup_vector_irq(smp_processor_id()); /* * Get our bogomips. * @@ -315,7 +320,6 @@ notrace static void __cpuinit start_secondary(void *unused) */ ipi_call_lock(); lock_vector_lock(); - __setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); ipi_call_unlock(); -- cgit v1.2.2 From 7c099ce1575126395f186ecf58b51a60d5c3be7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= <david@hardeman.nu> Date: Thu, 28 Jan 2010 21:02:54 +0100 Subject: x86: Add quirk for Intel DG45FC board to avoid low memory corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 6aa542a694dc9ea4344a8a590d2628c33d1b9431 added a quirk for the Intel DG45ID board due to low memory corruption. The Intel DG45FC shares the same BIOS (and the same bug) as noted in: http://bugzilla.kernel.org/show_bug.cgi?id=13736 Signed-off-by: David Härdeman <david@hardeman.nu> LKML-Reference: <20100128200254.GA9134@hardeman.nu> Cc: <stable@kernel.org> Cc: Alexey Fisher <bug-track@fisher-privat.net> Cc: ykzhao <yakui.zhao@intel.com> Cc: Tony Bones <aabonesml@gmail.com> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/setup.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7b8b9894b22..5d9e40c58628 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -642,19 +642,27 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. */ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, #endif {} }; -- cgit v1.2.2 From cc0967490c1c3824bc5b75718b6ca8a51d9f2617 Mon Sep 17 00:00:00 2001 From: Jason Wessel <jason.wessel@windriver.com> Date: Thu, 28 Jan 2010 17:04:42 -0600 Subject: x86, hw_breakpoints, kgdb: Fix kgdb to use hw_breakpoint API In the 2.6.33 kernel, the hw_breakpoint API is now used for the performance event counters. The hw_breakpoint_handler() now consumes the hw breakpoints that were previously set by kgdb arch specific code. In order for kgdb to work in conjunction with this core API change, kgdb must use some of the low level functions of the hw_breakpoint API to install, uninstall, and deal with hw breakpoint reservations. The kgdb core required a change to call kgdb_disable_hw_debug anytime a slave cpu enters kgdb_wait() in order to keep all the hw breakpoints in sync as well as to prevent hitting a hw breakpoint while kgdb is active. During the architecture specific initialization of kgdb, it will pre-allocate 4 disabled (struct perf event **) structures. Kgdb will use these to manage the capabilities for the 4 hw breakpoint registers, per cpu. Right now the hw_breakpoint API does not have a way to ask how many breakpoints are available, on each CPU so it is possible that the install of a breakpoint might fail when kgdb restores the system to the run state. The intent of this patch is to first get the basic functionality of hw breakpoints working and leave it to the person debugging the kernel to understand what hw breakpoints are in use and what restrictions have been imposed as a result. Breakpoint constraints will be dealt with in a future patch. While atomic, the x86 specific kgdb code will call arch_uninstall_hw_breakpoint() and arch_install_hw_breakpoint() to manage the cpu specific hw breakpoints. The net result of these changes allow kgdb to use the same pool of hw_breakpoints that are used by the perf event API, but neither knows about future reservations for the available hw breakpoint slots. Signed-off-by: Jason Wessel <jason.wessel@windriver.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad <prasad@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/kgdb.c | 171 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 114 insertions(+), 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index dd74fe7273b1..62bea7307eaa 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -42,6 +42,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/nmi.h> +#include <linux/hw_breakpoint.h> #include <asm/debugreg.h> #include <asm/apicdef.h> @@ -204,40 +205,38 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) static struct hw_breakpoint { unsigned enabled; - unsigned type; - unsigned len; unsigned long addr; + int len; + int type; + struct perf_event **pev; } breakinfo[4]; static void kgdb_correct_hw_break(void) { - unsigned long dr7; - int correctit = 0; - int breakbit; int breakno; - get_debugreg(dr7, 7); for (breakno = 0; breakno < 4; breakno++) { - breakbit = 2 << (breakno << 1); - if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { - correctit = 1; - dr7 |= breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - dr7 |= ((breakinfo[breakno].len << 2) | - breakinfo[breakno].type) << - ((breakno << 2) + 16); - set_debugreg(breakinfo[breakno].addr, breakno); - - } else { - if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { - correctit = 1; - dr7 &= ~breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - } - } + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; } - if (correctit) - set_debugreg(dr7, 7); + hw_breakpoint_restore(); } static int @@ -259,15 +258,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) static void kgdb_remove_all_hw_break(void) { int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; - for (i = 0; i < 4; i++) - memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } static int kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { - unsigned type; int i; for (i = 0; i < 4; i++) @@ -278,27 +285,38 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) switch (bptype) { case BP_HARDWARE_BREAKPOINT: - type = 0; - len = 1; + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; break; case BP_WRITE_WATCHPOINT: - type = 1; + breakinfo[i].type = X86_BREAKPOINT_WRITE; break; case BP_ACCESS_WATCHPOINT: - type = 3; + breakinfo[i].type = X86_BREAKPOINT_RW; break; default: return -1; } - - if (len == 1 || len == 2 || len == 4) - breakinfo[i].len = len - 1; - else + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: return -1; - - breakinfo[i].enabled = 1; + } breakinfo[i].addr = addr; - breakinfo[i].type = type; + breakinfo[i].enabled = 1; return 0; } @@ -313,8 +331,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) */ void kgdb_disable_hw_debug(struct pt_regs *regs) { + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } /** @@ -378,7 +409,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, struct pt_regs *linux_regs) { unsigned long addr; - unsigned long dr6; char *ptr; int newPC; @@ -404,20 +434,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, raw_smp_processor_id()); } - get_debugreg(dr6, 6); - if (!(dr6 & 0x4000)) { - int breakno; - - for (breakno = 0; breakno < 4; breakno++) { - if (dr6 & (1 << breakno) && - breakinfo[breakno].type == 0) { - /* Set restore flag: */ - linux_regs->flags |= X86_EFLAGS_RF; - break; - } - } - } - set_debugreg(0UL, 6); kgdb_correct_hw_break(); return 0; @@ -485,8 +501,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) break; case DIE_DEBUG: - if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id()) { + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) return single_step_cont(regs, args); break; @@ -539,7 +554,42 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - return register_die_notifier(&kgdb_notifier); + int i, cpu; + int ret; + struct perf_event_attr attr; + struct perf_event **pevent; + + ret = register_die_notifier(&kgdb_notifier); + if (ret != 0) + return ret; + /* + * Pre-allocate the hw breakpoint structions in the non-atomic + * portion of kgdb because this operation requires mutexs to + * complete. + */ + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.type = PERF_TYPE_BREAKPOINT; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + for (i = 0; i < 4; i++) { + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + if (IS_ERR(breakinfo[i].pev)) { + printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + breakinfo[i].pev = NULL; + kgdb_arch_exit(); + return -1; + } + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + pevent[0]->hw.sample_period = 1; + if (pevent[0]->destroy != NULL) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } + return ret; } /** @@ -550,6 +600,13 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { + int i; + for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } unregister_die_notifier(&kgdb_notifier); } -- cgit v1.2.2 From 5352ae638e2d7d5c9b2e4d528676bbf2af6fd6f3 Mon Sep 17 00:00:00 2001 From: Jason Wessel <jason.wessel@windriver.com> Date: Thu, 28 Jan 2010 17:04:43 -0600 Subject: perf, hw_breakpoint, kgdb: Do not take mutex for kernel debugger This patch fixes the regression in functionality where the kernel debugger and the perf API do not nicely share hw breakpoint reservations. The kernel debugger cannot use any mutex_lock() calls because it can start the kernel running from an invalid context. A mutex free version of the reservation API needed to get created for the kernel debugger to safely update hw breakpoint reservations. The possibility for a breakpoint reservation to be concurrently processed at the time that kgdb interrupts the system is improbable. Should this corner case occur the end user is warned, and the kernel debugger will prohibit updating the hardware breakpoint reservations. Any time the kernel debugger reserves a hardware breakpoint it will be a system wide reservation. Signed-off-by: Jason Wessel <jason.wessel@windriver.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad <prasad@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-3-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/kgdb.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 62bea7307eaa..bfba6019d762 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -239,6 +239,49 @@ static void kgdb_correct_hw_break(void) hw_breakpoint_restore(); } +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responisble for handing the retry on + * remove failure. + */ + return -1; + } + return 0; +} + static int kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { @@ -250,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) if (i == 4) return -1; + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } breakinfo[i].enabled = 0; return 0; @@ -316,6 +363,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) return -1; } breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } breakinfo[i].enabled = 1; return 0; -- cgit v1.2.2 From 3b9cfc0a99f88c0db7c72363620584a9b40b4543 Mon Sep 17 00:00:00 2001 From: Emese Revfy <re.emese@gmail.com> Date: Sun, 31 Jan 2010 20:16:34 +0100 Subject: x86, mtrr: Constify struct mtrr_ops This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy <re.emese@gmail.com> LKML-Reference: <4B65D712.3080804@gmail.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/mtrr/amd.c | 2 +- arch/x86/kernel/cpu/mtrr/centaur.c | 2 +- arch/x86/kernel/cpu/mtrr/cyrix.c | 2 +- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- arch/x86/kernel/cpu/mtrr/mtrr.h | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 33af14110dfd..92ba9cd31c9a 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) return 0; } -static struct mtrr_ops amd_mtrr_ops = { +static const struct mtrr_ops amd_mtrr_ops = { .vendor = X86_VENDOR_AMD, .set = amd_set_mtrr, .get = amd_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index de89f14eff3a..316fe3e60a97 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t return 0; } -static struct mtrr_ops centaur_mtrr_ops = { +static const struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, .set = centaur_set_mcr, .get = centaur_get_mcr, diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 228d982ce09c..68a3343e5798 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -265,7 +265,7 @@ static void cyrix_set_all(void) post_set(); } -static struct mtrr_ops cyrix_mtrr_ops = { +static const struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, .set_all = cyrix_set_all, .set = cyrix_set_arr, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 55da0c5f68dd..4d755846fee6 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -752,7 +752,7 @@ int positive_have_wrcomb(void) /* * Generic structure... */ -struct mtrr_ops generic_mtrr_ops = { +const struct mtrr_ops generic_mtrr_ops = { .use_intel_if = 1, .set_all = generic_set_all, .get = generic_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 84e83de54575..fe4622e8c837 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops *mtrr_if; +const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops *ops) +void set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index a501dee9a87a..df5e41f31a27 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size, extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); -extern struct mtrr_ops generic_mtrr_ops; +extern const struct mtrr_ops generic_mtrr_ops; extern int positive_have_wrcomb(void); @@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops *ops); +extern void set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops *mtrr_if; +extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) -- cgit v1.2.2 From 1b5576e69a5fe168c08a159685ac366316ac9bbc Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Fri, 22 Jan 2010 11:21:04 +0800 Subject: x86: Remove BIOS data range from e820 In preparation for moving to the generic page_is_ram(), make explicit what we expect to be reserved and not reserved. Tested-by: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <20100122033004.335813103@intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 8 ++++++++ arch/x86/kernel/setup.c | 19 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d17d482a04f4..230687ba5ba5 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -517,11 +517,19 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype) { int i; + u64 end; u64 real_removed_size = 0; if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cdb6a8a506dd..f9b1f4e5ab74 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -650,6 +650,23 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { {} }; +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + /* + * special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + * take them out. + */ + e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -813,7 +830,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - + trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, -- cgit v1.2.2 From f266d7f5f89652a68e21e9882c44ee9104ad8d61 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Wed, 3 Feb 2010 21:21:32 +0200 Subject: x86_64: Print modules like i386 does Print modules list during kernel BUG. Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/dumpstack_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f5..907a90e2901c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -291,6 +291,7 @@ void show_registers(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); + print_modules(); __show_regs(regs, 1); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); -- cgit v1.2.2 From 615d0ebbc782b67296e3226c293f520f93f93515 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Tue, 2 Feb 2010 16:49:04 -0500 Subject: kprobes: Disable booster when CONFIG_PREEMPT=y Disable kprobe booster when CONFIG_PREEMPT=y at this time, because it can't ensure that all kernel threads preempted on kprobe's boosted slot run out from the slot even using freeze_processes(). The booster on preemptive kernel will be resumed if synchronize_tasks() or something like that is introduced. Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Jim Keniston <jkenisto@us.ibm.com> Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Cc: Steven Rostedt <rostedt@goodmis.org> LKML-Reference: <20100202214904.4694.24330.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5b8c7505b3bc..9453815138fa 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -429,7 +429,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) +#if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); -- cgit v1.2.2 From 2cfa19780d61740f65790c5bae363b759d7c96fa Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Tue, 2 Feb 2010 16:49:11 -0500 Subject: ftrace/alternatives: Introducing *_text_reserved functions Introducing *_text_reserved functions for checking the text address range is partially reserved or not. This patch provides checking routines for x86 smp alternatives and dynamic ftrace. Since both functions modify fixed pieces of kernel text, they should reserve and protect those from other dynamic text modifier, like kprobes. This will also be extended when introducing other subsystems which modify fixed pieces of kernel text. Dynamic text modifiers should avoid those. Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Jim Keniston <jkenisto@us.ibm.com> Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Cc: Jason Baron <jbaron@redhat.com> LKML-Reference: <20100202214911.4694.16587.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/alternative.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de7353c0ce9c..3c13284ff86d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -390,6 +390,22 @@ void alternatives_smp_switch(int smp) mutex_unlock(&smp_alt); } +/* Return 1 if the address range is reserved for smp-alternatives */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + u8 **ptr; + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > end || mod->text_end < start) + continue; + for (ptr = mod->locks; ptr < mod->locks_end; ptr++) + if (start <= *ptr && end >= *ptr) + return 1; + } + + return 0; +} #endif #ifdef CONFIG_PARAVIRT -- cgit v1.2.2 From 4554dbcb85a4ed2abaa2b6fa15649b796699ec89 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Tue, 2 Feb 2010 16:49:18 -0500 Subject: kprobes: Check probe address is reserved Check whether the address of new probe is already reserved by ftrace or alternatives (on x86) when registering new probe. If reserved, it returns an error and not register the probe. Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Jim Keniston <jkenisto@us.ibm.com> Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Cc: Jason Baron <jbaron@redhat.com> LKML-Reference: <20100202214918.4694.94179.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/kprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 9453815138fa..5de9f4a9c3fd 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -337,6 +337,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + if (!can_probe((unsigned long)p->addr)) return -EILSEQ; /* insn: must be on special executable page on x86. */ -- cgit v1.2.2 From 8c48e444191de0ff84e85d41180d7bc3e74f14ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 29 Jan 2010 13:25:31 +0100 Subject: perf_events, x86: Implement intel core solo/duo support Implement Intel Core Solo/Duo, aka. Intel Architectural Performance Monitoring Version 1. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul Mackerras <paulus@samba.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Arjan van de Ven <arjan@linux.intel.com> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 133 ++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 72 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1846ead0576b..5b91992b6b25 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -227,6 +227,17 @@ static const u64 intel_perfmon_event_map[] = }; static struct event_constraint intel_core_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ @@ -1216,7 +1227,7 @@ static void intel_pmu_disable_all(void) intel_pmu_disable_bts(); } -static void amd_pmu_disable_all(void) +static void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -1226,11 +1237,11 @@ static void amd_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + rdmsrl(x86_pmu.eventsel + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -1278,7 +1289,7 @@ static void intel_pmu_enable_all(void) } } -static void amd_pmu_enable_all(void) +static void x86_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -1292,7 +1303,7 @@ static void amd_pmu_enable_all(void) val = event->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -1546,7 +1557,7 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); @@ -1598,12 +1609,6 @@ intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) x86_pmu_disable_event(hwc, idx); } -static inline void -amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - x86_pmu_disable_event(hwc, idx); -} - static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* @@ -1723,15 +1728,14 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) return; } - x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(hwc, idx); } -static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (cpuc->enabled) - x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(hwc, idx); } /* @@ -1988,50 +1992,6 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } -static int p6_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - data.raw = NULL; - - cpuc = &__get_cpu_var(cpu_hw_events); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - event = cpuc->events[idx]; - hwc = &event->hw; - - val = x86_perf_event_update(event, hwc, idx); - if (val & (1ULL << (x86_pmu.event_bits - 1))) - continue; - - /* - * event overflow - */ - handled = 1; - data.period = event->hw.last_period; - - if (!x86_perf_event_set_period(event, hwc, idx)) - continue; - - if (perf_event_overflow(event, 1, &data, regs)) - p6_pmu_disable_event(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -2098,7 +2058,7 @@ again: return 1; } -static int amd_pmu_handle_irq(struct pt_regs *regs) +static int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; @@ -2133,7 +2093,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - amd_pmu_disable_event(hwc, idx); + x86_pmu.disable(hwc, idx); } if (handled) @@ -2374,7 +2334,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { static __initconst struct x86_pmu p6_pmu = { .name = "p6", - .handle_irq = p6_pmu_handle_irq, + .handle_irq = x86_pmu_handle_irq, .disable_all = p6_pmu_disable_all, .enable_all = p6_pmu_enable_all, .enable = p6_pmu_enable_event, @@ -2401,6 +2361,29 @@ static __initconst struct x86_pmu p6_pmu = { .event_constraints = intel_p6_event_constraints }; +static __initconst struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_core_event_constraints, +}; + static __initconst struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -2427,11 +2410,11 @@ static __initconst struct x86_pmu intel_pmu = { static __initconst struct x86_pmu amd_pmu = { .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_event, - .disable = amd_pmu_disable_event, + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, @@ -2498,9 +2481,10 @@ static __init int intel_pmu_init(void) version = eax.split.version_id; if (version < 2) - return -ENODEV; + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; - x86_pmu = intel_pmu; x86_pmu.version = version; x86_pmu.num_events = eax.split.num_events; x86_pmu.event_bits = eax.split.bit_width; @@ -2510,12 +2494,17 @@ static __init int intel_pmu_init(void) * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: */ - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + if (version > 1) + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); /* * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ @@ -2523,7 +2512,7 @@ static __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - x86_pmu.event_constraints = intel_core_event_constraints; + x86_pmu.event_constraints = intel_core2_event_constraints; pr_cont("Core2 events, "); break; -- cgit v1.2.2 From fce877e3a429940a986e085a41e8b57f2d922e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 29 Jan 2010 13:25:12 +0100 Subject: bitops: Ensure the compile time HWEIGHT is only used for such Avoid accidental misuse by failing to compile things Suggested-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Linus Torvalds <torvalds@linux-foundation.org> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5b91992b6b25..96cfc1a4fe9f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,13 +93,16 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ +#define __EVENT_CONSTRAINT(c, n, m, w) {\ { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ - .weight = HWEIGHT64((u64)(n)), \ + .weight = (w), \ } +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) + #define INTEL_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) @@ -2622,7 +2625,8 @@ void __init init_hw_perf_events(void) register_die_notifier(&perf_event_nmi_notifier); unconstrained = (struct event_constraint) - EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0); + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, + 0, x86_pmu.num_events); pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); -- cgit v1.2.2 From 447a194b393f32699607fd99617a40abd6a95114 Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Mon, 1 Feb 2010 14:50:01 +0200 Subject: perf_events, x86: Fix bug in hw_perf_enable() We cannot assume that because hwc->idx == assign[i], we can avoid reprogramming the counter in hw_perf_enable(). The event may have been scheduled out and another event may have been programmed into this counter. Thus, we need a more robust way of verifying if the counter still contains config/data related to an event. This patch adds a generation number to each counter on each cpu. Using this mechanism we can verify reliabilty whether the content of a counter corresponds to an event. Signed-off-by: Stephane Eranian <eranian@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul Mackerras <paulus@samba.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> LKML-Reference: <4b66dc67.0b38560a.1635.ffffae18@mx.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 96cfc1a4fe9f..a920f173a220 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -90,6 +90,7 @@ struct cpu_hw_events { int n_events; int n_added; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; @@ -1142,6 +1143,8 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->config = ARCH_PERFMON_EVENTSEL_INT; hwc->idx = -1; + hwc->last_cpu = -1; + hwc->last_tag = ~0ULL; /* * Count user and OS events unless requested not to. @@ -1457,11 +1460,14 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, return n; } - static inline void x86_assign_hw_event(struct perf_event *event, - struct hw_perf_event *hwc, int idx) + struct cpu_hw_events *cpuc, int i) { - hwc->idx = idx; + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; @@ -1480,6 +1486,15 @@ static inline void x86_assign_hw_event(struct perf_event *event, } } +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) +{ + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; +} + static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc); void hw_perf_enable(void) @@ -1508,7 +1523,14 @@ void hw_perf_enable(void) event = cpuc->event_list[i]; hwc = &event->hw; - if (hwc->idx == -1 || hwc->idx == cpuc->assign[i]) + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) continue; __x86_pmu_disable(event, cpuc); @@ -1522,12 +1544,12 @@ void hw_perf_enable(void) hwc = &event->hw; if (hwc->idx == -1) { - x86_assign_hw_event(event, hwc, cpuc->assign[i]); + x86_assign_hw_event(event, cpuc, i); x86_perf_event_set_period(event, hwc, hwc->idx); } /* * need to mark as active because x86_pmu_disable() - * clear active_mask and eventsp[] yet it preserves + * clear active_mask and events[] yet it preserves * idx */ set_bit(hwc->idx, cpuc->active_mask); -- cgit v1.2.2 From 34d2819f20782feb60f9434470ecfb200875fd41 Mon Sep 17 00:00:00 2001 From: Borislav Petkov <petkovbb@googlemail.com> Date: Thu, 4 Feb 2010 09:51:28 +0100 Subject: x86, mtrr: Remove unused mtrr/state.c The last reference to the helpers in <arch/x86/kernel/cpu/mtrr/state.c> went away with 9a6b344ea967efa0bb5ca4cb5405f840652b66c4 leaving unused code. Remove it. Signed-off-by: Borislav Petkov <borislav.petkov@amd.com> LKML-Reference: <20100204085128.GA513@liondog.tnic> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mtrr/Makefile | 2 +- arch/x86/kernel/cpu/mtrr/state.c | 94 --------------------------------------- 2 files changed, 1 insertion(+), 95 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mtrr/state.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index f4361b56f8e9..ad9e5ed81181 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o state.o cleanup.o +obj-y := main.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c deleted file mode 100644 index dfc80b4e6b0d..000000000000 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ /dev/null @@ -1,94 +0,0 @@ -#include <linux/init.h> -#include <linux/io.h> -#include <linux/mm.h> - -#include <asm/processor-cyrix.h> -#include <asm/processor-flags.h> -#include <asm/mtrr.h> -#include <asm/msr.h> - -#include "mtrr.h" - -/* Put the processor into a state where MTRRs can be safely set */ -void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) -{ - unsigned int cr0; - - /* Disable interrupts locally */ - local_irq_save(ctxt->flags); - - if (use_intel() || is_cpu(CYRIX)) { - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { - ctxt->cr4val = read_cr4(); - write_cr4(ctxt->cr4val & ~X86_CR4_PGE); - } - - /* - * Disable and flush caches. Note that wbinvd flushes the TLBs - * as a side-effect - */ - cr0 = read_cr0() | X86_CR0_CD; - wbinvd(); - write_cr0(cr0); - wbinvd(); - - if (use_intel()) { - /* Save MTRR state */ - rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else were excluded at the top - */ - ctxt->ccr3 = getCx86(CX86_CCR3); - } - } -} - -void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) -{ - if (use_intel()) { - /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, - ctxt->deftype_hi); - } else { - if (is_cpu(CYRIX)) { - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); - } - } -} - -/* Restore the processor after a set_mtrr_prepare */ -void set_mtrr_done(struct set_mtrr_context *ctxt) -{ - if (use_intel() || is_cpu(CYRIX)) { - - /* Flush caches and TLBs */ - wbinvd(); - - /* Restore MTRRdefType */ - if (use_intel()) { - /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, - ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else was excluded at the top - */ - setCx86(CX86_CCR3, ctxt->ccr3); - } - - /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); - - /* Restore value of CR4 */ - if (cpu_has_pge) - write_cr4(ctxt->cr4val); - } - /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); -} -- cgit v1.2.2 From 5d93a14241bf5ba299422440bc366ec43970c002 Mon Sep 17 00:00:00 2001 From: Shaun Patterson <shaunpatterson@gmail.com> Date: Sat, 5 Dec 2009 22:30:52 -0500 Subject: vmiclock: fix comment spelling mistake Signed-off-by: Shaun Patterson <shaunpatterson@gmail.com> Signed-off-by: Jiri Kosina <jkosina@suse.cz> --- arch/x86/kernel/vmiclock_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 74c92bb194df..25bbb9bfc312 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -171,7 +171,7 @@ static int vmi_timer_next_event(unsigned long delta, { /* Unfortunately, set_next_event interface only passes relative * expiry, but we want absolute expiry. It'd be better if were - * were passed an aboslute expiry, since a bunch of time may + * were passed an absolute expiry, since a bunch of time may * have been stolen between the time the delta is computed and * when we set the alarm below. */ cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); -- cgit v1.2.2 From e34b7005e5f55a55964c13ec9784e8e2b427a83c Mon Sep 17 00:00:00 2001 From: Jasper Spaans <spaans@fox-it.com> Date: Fri, 20 Nov 2009 14:20:05 +0100 Subject: arch/x86/kernel/apic/apic_flat_64.c: Make comment match the code Make the comment match the code, this also holds for intel systems, according to probe_64.c in the same directory. Signed-off-by: Jasper Spaans <spaans@fox-it.com> Signed-off-by: Jiri Kosina <jkosina@suse.cz> --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index e3c3d820c325..09d3b17ce0c2 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -223,7 +223,7 @@ struct apic apic_flat = { }; /* - * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * Physflat mode is used when there are more than 8 CPUs on a system. * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ -- cgit v1.2.2 From c9404c9c392d557a4687c4cbda022b03cb787ce9 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder <adam.buchbinder@gmail.com> Date: Fri, 18 Dec 2009 15:40:42 -0500 Subject: Fix misspelling of "should" and "shouldn't" in comments. Some comments misspell "should" or "shouldn't"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder <adam.buchbinder@gmail.com> Signed-off-by: Jiri Kosina <jkosina@suse.cz> --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..118428085ea2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -604,7 +604,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct perf_event_attr attr; /* - * We shoud have at least an inactive breakpoint at this + * We should have at least an inactive breakpoint at this * slot. It means the user is writing dr7 without having * written the address register first */ -- cgit v1.2.2 From fb637f3cd31783db2b654842ea32ffec15c4bd62 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" <justinmattock@gmail.com> Date: Thu, 14 Jan 2010 22:16:16 -0800 Subject: fix comment typo in pci-dma.c Signed-off-by: Justin P. Mattock <justinmattock@gmail.com> Signed-off-by: Jiri Kosina <jkosina@suse.cz> --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 75e14e21f61a..eec33a7d96a0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -38,7 +38,7 @@ int iommu_detected __read_mostly = 0; * This variable becomes 1 if iommu=pt is passed on the kernel command line. * If this variable is 1, IOMMU implementations do no DMA translation for * devices and allow every device to access to whole physical memory. This is - * useful if a user want to use an IOMMU only for KVM device assignment to + * useful if a user wants to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. */ int iommu_pass_through __read_mostly; -- cgit v1.2.2 From 17622339af2536b32cf29699ddd4ba0fe79a61d5 Mon Sep 17 00:00:00 2001 From: Magnus Damm <damm@opensource.se> Date: Tue, 2 Feb 2010 14:41:39 -0800 Subject: clocksource: add argument to resume callback Pass the clocksource as an argument to the clocksource resume callback. Needed so we can point out which CMT channel the sh_cmt.c driver shall resume. Signed-off-by: Magnus Damm <damm@opensource.se> Cc: john stultz <johnstul@us.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/tsc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad80a1c718c6..ee4fa1bfcb33 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -266,7 +266,7 @@ static void hpet_resume_device(void) force_hpet_resume(); } -static void hpet_resume_counter(void) +static void hpet_resume_counter(struct clocksource *cs) { hpet_resume_device(); hpet_restart_counter(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba0..9eeb9be26aa4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif -static void resume_tsc(void) +static void resume_tsc(struct clocksource *cs) { clocksource_tsc.cycle_last = 0; } -- cgit v1.2.2 From 841582ea9e29a8f757c30c5377ce649586ba793a Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Tue, 2 Feb 2010 14:38:14 -0800 Subject: x86, uv: Update UV arch to target Legacy VGA I/O correctly. Add function to direct Legacy VGA I/O traffic to correct I/O Hub. Signed-off-by: Mike Travis <travis@sgi.com> LKML-Reference: <201002022238.o12McEbi018727@imap1.linux-foundation.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Robin Holt <holt@sgi.com> Cc: Jack Steiner <steiner@sgi.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Jesse Barnes <jbarnes@virtuousgeek.org> Cc: David Airlie <airlied@linux.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/x2apic_uv_x.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/bios_uv.c | 19 +++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 21db3cbea7dc..6ef2899eb861 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -20,6 +20,7 @@ #include <linux/cpu.h> #include <linux/init.h> #include <linux/io.h> +#include <linux/pci.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -34,6 +35,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); +#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) + static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; int uv_min_hub_revision_id; @@ -553,6 +556,30 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ +/* Direct Legacy VGA I/O traffic to designated IOH */ +int uv_set_vga_state(struct pci_dev *pdev, bool decode, + unsigned int command_bits, bool change_bridge) +{ + int domain, bus, rc; + + PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", + pdev->devfn, decode, command_bits, change_bridge); + + if (!change_bridge) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); + + return rc; +} + /* * Called on each cpu to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet @@ -691,4 +718,7 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); proc_mkdir("sgi_uv", NULL); + + /* register Legacy VGA I/O redirection handler */ + pci_register_set_vga_state(uv_set_vga_state); } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index b0206a211b09..575127a6e352 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -154,6 +154,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) } EXPORT_SYMBOL_GPL(uv_bios_freq_base); +/* + * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target + * @decode: true to enable target, false to disable target + * @domain: PCI domain number + * @bus: PCI bus number + * + * Returns: + * 0: Success + * -EINVAL: Invalid domain or bus number + * -ENOSYS: Capability not available + * -EBUSY: Legacy VGA I/O cannot be retargeted at this time + */ +int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) +{ + return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, + (u64)decode, (u64)domain, (u64)bus, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); + #ifdef CONFIG_EFI void uv_bios_init(void) -- cgit v1.2.2 From 3235dc3f22378f35ce77eba0d0f62db2d9c4844e Mon Sep 17 00:00:00 2001 From: Frans Pop <elendil@planet.nl> Date: Sat, 6 Feb 2010 18:47:17 +0100 Subject: x86: Remove trailing spaces in messages Signed-off-by: Frans Pop <elendil@planet.nl> Cc: Avi Kivity <avi@redhat.com> Cc: x86@kernel.org LKML-Reference: <1265478443-31072-10-git-send-email-elendil@planet.nl> [ Left out the KVM bits. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/apm_32.c | 4 ++-- arch/x86/kernel/efi.c | 2 +- arch/x86/kernel/microcode_intel.c | 2 +- arch/x86/kernel/uv_sysfs.c | 6 +++--- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e80f291472a4..71c4443bb91f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -587,7 +587,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld) \n", + "PM-Timer: %lu (%ld)\n", (unsigned long)res, *deltatsc); *deltatsc = (long)res; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816d..6bdd2c7ead75 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1647,7 +1647,7 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); + " Stat Dmod Deli Vect:\n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 98c4665f251c..47dd856708e5 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) mpc_record = 0; printk(KERN_INFO - "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + "Found an OEM MPC table at %8p - parsing it...\n", oemtable); if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { printk(KERN_WARNING diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index b5b6b23bce53..031aa887b0eb 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d) apm_info.disabled = 1; printk(KERN_INFO "%s machine detected. " "Disabling APM.\n", d->ident); - printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); - printk(KERN_INFO "download from support.intel.com \n"); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); } return 0; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index cdcfb122f256..c2fa9b8b497e 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -362,7 +362,7 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); early_iounmap(tmp, 2); - printk(KERN_INFO "EFI v%u.%.02u by %s \n", + printk(KERN_INFO "EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index ebd193e476ca..85a343e28937 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -328,7 +328,7 @@ static int apply_microcode(int cpu) cpu_num, mc_intel->hdr.rev); return -1; } - pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c index 36afb98675a4..309c70fb7759 100644 --- a/arch/x86/kernel/uv_sysfs.c +++ b/arch/x86/kernel/uv_sysfs.c @@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void) if (!sgi_uv_kobj) sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); return -EINVAL; } ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); return ret; } ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); return ret; } -- cgit v1.2.2 From 076dc4a65a6d99a16979e2c7917e669fb8c91ee5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Fri, 5 Feb 2010 12:16:47 -0500 Subject: x86/alternatives: Fix build warning Fixes these warnings: arch/x86/kernel/alternative.c: In function 'alternatives_text_reserved': arch/x86/kernel/alternative.c:402: warning: comparison of distinct pointer types lacks a cast arch/x86/kernel/alternative.c:402: warning: comparison of distinct pointer types lacks a cast arch/x86/kernel/alternative.c:405: warning: comparison of distinct pointer types lacks a cast arch/x86/kernel/alternative.c:405: warning: comparison of distinct pointer types lacks a cast Caused by: 2cfa197: ftrace/alternatives: Introducing *_text_reserved functions Changes in v2: - Use local variables to compare, instead of type casts. Reported-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> LKML-Reference: <20100205171647.15750.37221.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/alternative.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3c13284ff86d..e63b80e5861c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -395,12 +395,14 @@ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; u8 **ptr; + u8 *text_start = start; + u8 *text_end = end; list_for_each_entry(mod, &smp_alt_modules, next) { - if (mod->text > end || mod->text_end < start) + if (mod->text > text_end || mod->text_end < text_start) continue; for (ptr = mod->locks; ptr < mod->locks_end; ptr++) - if (start <= *ptr && end >= *ptr) + if (text_start <= *ptr && text_end >= *ptr) return 1; } -- cgit v1.2.2 From 3ad2f3fbb961429d2aa627465ae4829758bc7e07 Mon Sep 17 00:00:00 2001 From: Daniel Mack <daniel@caiaq.de> Date: Wed, 3 Feb 2010 08:01:28 +0800 Subject: tree-wide: Assorted spelling fixes In particular, several occurances of funny versions of 'success', 'unknown', 'therefore', 'acknowledge', 'argument', 'achieve', 'address', 'beginning', 'desirable', 'separate' and 'necessary' are fixed. Signed-off-by: Daniel Mack <daniel@caiaq.de> Cc: Joe Perches <joe@perches.com> Cc: Junio C Hamano <gitster@pobox.com> Signed-off-by: Jiri Kosina <jkosina@suse.cz> --- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/tsc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2d8b5035371c..3d1e6f16b7a6 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -27,7 +27,7 @@ #define GET_CR2_INTO_RCX movq %cr2, %rcx #endif -/* we are not able to switch in one step to the final KERNEL ADRESS SPACE +/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. * */ diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 2bbde6078143..fb99f7edb341 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) /* * get_tce_space_from_tar(): * Function for kdump case. Get the tce tables from first kernel - * by reading the contents of the base adress register of calgary iommu + * by reading the contents of the base address register of calgary iommu */ static void __init get_tce_space_from_tar(void) { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba0..dec8f68e3eda 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -50,7 +50,7 @@ u64 native_sched_clock(void) * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform - * can achive it. ) + * can achieve it. ) */ if (unlikely(tsc_disabled)) { /* No locking but a rare wrong value is not a big deal: */ -- cgit v1.2.2 From 681ee44d40d7c93b42118320e4620d07d8704fd6 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Tue, 9 Feb 2010 18:01:44 -0800 Subject: x86, apic: Don't use logical-flat mode when CPU hotplug may exceed 8 CPUs We need to fall back from logical-flat APIC mode to physical-flat mode when we have more than 8 CPUs. However, in the presence of CPU hotplug(with bios listing not enabled but possible cpus as disabled cpus in MADT), we have to consider the number of possible CPUs rather than the number of current CPUs; otherwise we may cross the 8-CPU boundary when CPUs are added later. 32bit apic code can use more cleanups (like the removal of vendor checks in 32bit default_setup_apic_routing()) and more unifications with 64bit code. Yinghai has some patches in works already. This patch addresses the boot issue that is reported in the virtualization guest context. [ hpa: incorporated function annotation feedback from Yinghai Lu ] Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <1265767304.2833.19.camel@sbs-t61.sc.intel.com> Acked-by: Shaohui Zheng <shaohui.zheng@intel.com> Reviewed-by: Yinghai Lu <yinghai@kernel.org> Cc: <stable@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/acpi/boot.c | 5 ----- arch/x86/kernel/apic/apic.c | 17 ----------------- arch/x86/kernel/apic/probe_32.c | 29 +++++++++++++++++++++++++++-- arch/x86/kernel/apic/probe_64.c | 2 +- arch/x86/kernel/mpparse.c | 7 ------- arch/x86/kernel/smpboot.c | 2 -- 6 files changed, 28 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 036d28adf59d..0acbcdfa5ca4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1185,9 +1185,6 @@ static void __init acpi_process_madt(void) if (!error) { acpi_lapic = 1; -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif /* * Parse MADT IO-APIC entries */ @@ -1197,8 +1194,6 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - if (apic->setup_apic_routing) - apic->setup_apic_routing(); } } if (error == -EINVAL) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3987e4408f75..dfca210f6a10 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1641,9 +1641,7 @@ int __init APIC_init_uniprocessor(void) #endif enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif verify_local_APIC(); connect_bsp_APIC(); @@ -1891,21 +1889,6 @@ void __cpuinit generic_processor_info(int apicid, int version) if (apicid > max_physical_apicid) max_physical_apicid = apicid; -#ifdef CONFIG_X86_32 - if (num_processors > 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } -#endif - #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1a6559f6768c..99d2fe016084 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -52,7 +52,32 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void default_setup_apic_routing(void) +void __init default_setup_apic_routing(void) +{ + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + +#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); +} + +static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC printk(KERN_INFO @@ -103,7 +128,7 @@ struct apic apic_default = { .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = default_setup_apic_routing, + .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 450fe2064a14..83e9be4778e2 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -67,7 +67,7 @@ void __init default_setup_apic_routing(void) } #endif - if (apic == &apic_flat && num_processors > 8) + if (apic == &apic_flat && num_possible_cpus() > 8) apic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 40b54ceb68b5..a2c1edd2d3ac 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) x86_init.mpparse.mpc_record(1); } -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); - if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..b4e870cbdc60 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1083,9 +1083,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); -- cgit v1.2.2 From 0271f91003d3703675be13b8865618359a6caa1f Mon Sep 17 00:00:00 2001 From: Haicheng Li <haicheng.li@linux.intel.com> Date: Thu, 4 Feb 2010 19:06:33 +0800 Subject: x86, acpi: Map hotadded cpu to correct node. When hotadd new cpu to system, if its affinitive node is online, should map the cpu to its own node. Otherwise, let kernel select one online node for the new cpu later. Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com> LKML-Reference: <4B6AAA39.6000300@linux.intel.com> Tested-by: Thomas Renninger <trenn@suse.de> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/acpi/boot.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 036d28adf59d..7db15e161aa0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -49,6 +49,7 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include <asm/proto.h> +# include <asm/numa_64.h> #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -482,6 +483,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid == -1 || !node_online(nid)) + return; +#ifdef CONFIG_X86_64 + apicid_to_node[physid] = nid; + numa_set_node(cpu, nid); +#else /* CONFIG_X86_32 */ + apicid_2_node[physid] = nid; + cpu_to_node_map[cpu] = nid; +#endif + +#endif +} + static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -540,6 +560,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) } cpu = cpumask_first(new_map); + acpi_map_cpu2node(handle, cpu, physid); *pcpu = cpu; retval = 0; -- cgit v1.2.2 From 18dce6ba5c8c6bd0f3ab4efa4cbdd698dab5c40a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:05 -0800 Subject: x86: Fix SCI on IOAPIC != 0 Thomas Renninger <trenn@suse.de> reported on IBM x3330 booting a latest kernel on this machine results in: PCI: PCI BIOS revision 2.10 entry at 0xfd61c, last bus=1 PCI: Using configuration type 1 for base access bio: create slab <bio-0> at 0 ACPI: SCI (IRQ30) allocation failed ACPI Exception: AE_NOT_ACQUIRED, Unable to install System Control Interrupt handler (20090903/evevent-161) ACPI: Unable to start the ACPI Interpreter Later all kind of devices fail... and bisect it down to this commit: commit b9c61b70075c87a8612624736faf4a2de5b1ed30 x86/pci: update pirq_enable_irq() to setup io apic routing it turns out we need to set irq routing for the sci on ioapic1 early. -v2: make it work without sparseirq too. -v3: fix checkpatch.pl warning, and cc to stable Reported-by: Thomas Renninger <trenn@suse.de> Bisected-by: Thomas Renninger <trenn@suse.de> Tested-by: Thomas Renninger <trenn@suse.de> Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-2-git-send-email-yinghai@kernel.org> Cc: stable@kernel.org Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/acpi/boot.c | 9 +++++++- arch/x86/kernel/apic/io_apic.c | 50 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdfa5ca4..5c96b75c6ea8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -446,6 +446,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { *irq = gsi; + +#ifdef CONFIG_X86_IO_APIC + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) + setup_IO_APIC_irq_extra(gsi); +#endif + return 0; } @@ -473,7 +479,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif - acpi_gsi_to_irq(plat_gsi, &irq); + irq = plat_gsi; + return irq; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816d..5e4cce254e43 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1538,6 +1538,56 @@ static void __init setup_IO_APIC_irqs(void) " (apicid-pin) not connected\n"); } +/* + * for the gsit that is not in first ioapic + * but could not use acpi_register_gsi() + * like some special sci in IBM x3330 + */ +void setup_IO_APIC_irq_extra(u32 gsi) +{ + int apic_id = 0, pin, idx, irq; + int node = cpu_to_node(boot_cpu_id); + struct irq_desc *desc; + struct irq_cfg *cfg; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + apic_id = mp_find_ioapic(gsi); + if (apic_id < 0) + return; + + pin = mp_find_ioapic_pin(apic_id, gsi); + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) + return; + + irq = pin_2_irq(idx, apic_id, pin); +#ifdef CONFIG_SPARSE_IRQ + desc = irq_to_desc(irq); + if (desc) + return; +#endif + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + return; + } + + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + + if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[apic_id].apicid, pin); + return; + } + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); +} + /* * Set up the timer pin, possibly with the 8259A-master behind. */ -- cgit v1.2.2 From ced5b697a76d325e7a7ac7d382dbbb632c765093 Mon Sep 17 00:00:00 2001 From: Brandon Phiilps <bphilips@suse.de> Date: Wed, 10 Feb 2010 01:20:06 -0800 Subject: x86: Avoid race condition in pci_enable_msix() Keep chip_data in create_irq_nr and destroy_irq. When two drivers are setting up MSI-X at the same time via pci_enable_msix() there is a race. See this dmesg excerpt: [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X [ 85.170611] alloc irq_desc for 99 on node -1 [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X [ 85.170614] alloc kstat_irqs on node -1 [ 85.170616] alloc irq_2_iommu on node -1 [ 85.170617] alloc irq_desc for 100 on node -1 [ 85.170619] alloc kstat_irqs on node -1 [ 85.170621] alloc irq_2_iommu on node -1 [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X [ 85.170626] alloc irq_desc for 101 on node -1 [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X [ 85.170630] alloc kstat_irqs on node -1 [ 85.170631] alloc irq_2_iommu on node -1 [ 85.170635] alloc irq_desc for 102 on node -1 [ 85.170636] alloc kstat_irqs on node -1 [ 85.170639] alloc irq_2_iommu on node -1 [ 85.170646] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 As you can see igb and ixgbe are both alternating on create_irq_nr() via pci_enable_msix() in their probe function. ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data = NULL via dynamic_irq_init(). igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[] via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this: cfg_new = irq_desc_ptrs[102]->chip_data; if (cfg_new->vector != 0) continue; This hits the NULL deref. Another possible race exists via pci_disable_msix() in a driver or in the number of error paths that call free_msi_irqs(): destroy_irq() dynamic_irq_cleanup() which sets desc->chip_data = NULL ...race window... desc->chip_data = cfg; Remove the save and restore code for cfg in create_irq_nr() and destroy_irq() and take the desc->lock when checking the irq_cfg. Reported-and-analyzed-by: Brandon Philips <bphilips@suse.de> Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-3-git-send-email-yinghai@kernel.org> Signed-off-by: Brandon Phililps <bphilips@suse.de> Cc: stable@kernel.org Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816d..c86591b906fa 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3228,12 +3228,9 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) { - dynamic_irq_init(irq); - /* restore it, in case dynamic_irq_init clear it */ - if (desc_new) - desc_new->chip_data = cfg_new; - } + if (irq > 0) + dynamic_irq_init_keep_chip_data(irq); + return irq; } @@ -3256,17 +3253,12 @@ void destroy_irq(unsigned int irq) { unsigned long flags; struct irq_cfg *cfg; - struct irq_desc *desc; - /* store it, in case dynamic_irq_cleanup clear it */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - dynamic_irq_cleanup(irq); - /* connect back irq_cfg */ - desc->chip_data = cfg; + dynamic_irq_cleanup_keep_chip_data(irq); free_irte(irq); spin_lock_irqsave(&vector_lock, flags); + cfg = irq_to_desc(irq)->chip_data; __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } -- cgit v1.2.2 From 27811d8cabe56e0c3622251b049086f49face4ff Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:07 -0800 Subject: x86: Move range related operation to one file We have almost the same code for mtrr cleanup and amd_bus checkup, and this code will also be used in replacing bootmem with early_res, so try to move them together and reuse it from different parts. Also rename update_range to subtract_range as that is what the function is actually doing. -v2: update comments as Christoph requested Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-4-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/mtrr/cleanup.c | 180 ++++--------------------------------- arch/x86/kernel/mmconf-fam10h_64.c | 7 +- 2 files changed, 17 insertions(+), 170 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 09b1698e0466..669da09ab9a8 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -22,10 +22,10 @@ #include <linux/pci.h> #include <linux/smp.h> #include <linux/cpu.h> -#include <linux/sort.h> #include <linux/mutex.h> #include <linux/uaccess.h> #include <linux/kvm_para.h> +#include <linux/range.h> #include <asm/processor.h> #include <asm/e820.h> @@ -34,11 +34,6 @@ #include "mtrr.h" -struct res_range { - unsigned long start; - unsigned long end; -}; - struct var_mtrr_range_state { unsigned long base_pfn; unsigned long size_pfn; @@ -56,7 +51,7 @@ struct var_mtrr_state { /* Should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 -static struct res_range __initdata range[RANGE_NUM]; +static struct range __initdata range[RANGE_NUM]; static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; @@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; #define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) - -static int __init -add_range(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - /* Out of slots: */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -static int __init -add_range_with_merge(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - int i; - - /* Try to merge it with old one: */ - for (i = 0; i < nr_range; i++) { - unsigned long final_start, final_end; - unsigned long common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end + 1) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* Need to add it: */ - return add_range(range, nr_range, start, end); -} - -static void __init -subtract_range(struct res_range *range, unsigned long start, unsigned long end) -{ - int i, j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* Find the new spare: */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static int __init cmp_range(const void *x1, const void *x2) -{ - const struct res_range *r1 = x1; - const struct res_range *r2 = x2; - long start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - -static int __init clean_sort_range(struct res_range *range, int az) -{ - int i, j, k = az - 1, nr_range = 0; - - for (i = 0; i < k; i++) { - if (range[i].end) - continue; - for (j = k; j > i; j--) { - if (range[j].end) { - k = j; - break; - } - } - if (j == i) - break; - range[i].start = range[k].start; - range[i].end = range[k].end; - range[k].start = 0; - range[k].end = 0; - k--; - } - /* count it */ - for (i = 0; i < az; i++) { - if (!range[i].end) { - nr_range = i; - break; - } - } - - /* sort them */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - - return nr_range; -} - #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, +x86_get_mtrr_mem_range(struct range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) { @@ -223,13 +77,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, continue; base = range_state[i].base_pfn; size = range_state[i].size_pfn; - nr_range = add_range_with_merge(range, nr_range, base, - base + size - 1); + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size - 1); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } @@ -252,10 +106,10 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size - 1); } if (extra_remove_size) - subtract_range(range, extra_remove_base, + subtract_range(range, RANGE_NUM, extra_remove_base, extra_remove_base + extra_remove_size - 1); if (debug_print) { @@ -263,7 +117,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } } @@ -273,20 +127,16 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } - /* clear those is not used */ - for (i = nr_range; i < RANGE_NUM; i++) - memset(&range[i], 0, sizeof(range[i])); - return nr_range; } #ifdef CONFIG_MTRR_SANITIZER -static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +static unsigned long __init sum_ranges(struct range *range, int nr_range) { unsigned long sum = 0; int i; @@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg) early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init -x86_setup_var_mtrrs(struct res_range *range, int nr_range, +x86_setup_var_mtrrs(struct range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; @@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size, unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct res_range range_new[RANGE_NUM]; + static struct range range_new[RANGE_NUM]; unsigned long range_sums_new; static int nr_range_new; int num_reg; @@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits) * [0, 1M) should always be covered by var mtrr with WB * and fixed mtrrs should take effect before var mtrr for it: */ - nr_range = add_range_with_merge(range, nr_range, 0, + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); /* Sort the ranges: */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + sort_range(range, nr_range); range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM covered: %ldM\n", diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 712d15fdc416..71825806cd44 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -7,6 +7,8 @@ #include <linux/string.h> #include <linux/pci.h> #include <linux/dmi.h> +#include <linux/range.h> + #include <asm/pci-direct.h> #include <linux/sort.h> #include <asm/io.h> @@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, }; -struct range { - u64 start; - u64 end; -}; - static int __cpuinit cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; -- cgit v1.2.2 From e9a0064ad03b899938059bb576615ad9ed0f27f9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:13 -0800 Subject: x86: Change range end to start+size So make interface more consistent with early_res. Later we can share some code with early_res. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-10-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/mtrr/cleanup.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 669da09ab9a8..06130b52f012 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -78,13 +78,13 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, base = range_state[i].base_pfn; size = range_state[i].size_pfn; nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, - base, base + size - 1); + base, base + size); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); for (i = 0; i < nr_range; i++) printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } /* Take out UC ranges: */ @@ -106,11 +106,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, RANGE_NUM, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size); } if (extra_remove_size) subtract_range(range, RANGE_NUM, extra_remove_base, - extra_remove_base + extra_remove_size - 1); + extra_remove_base + extra_remove_size); if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); @@ -118,7 +118,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, if (!range[i].end) continue; printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } } @@ -128,7 +128,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } return nr_range; @@ -142,7 +142,7 @@ static unsigned long __init sum_ranges(struct range *range, int nr_range) int i; for (i = 0; i < nr_range; i++) - sum += range[i].end + 1 - range[i].start; + sum += range[i].end - range[i].start; return sum; } @@ -489,7 +489,7 @@ x86_setup_var_mtrrs(struct range *range, int nr_range, /* Write the range: */ for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, - range[i].end - range[i].start + 1); + range[i].end - range[i].start); } /* Write the last range: */ @@ -720,7 +720,7 @@ int __init mtrr_cleanup(unsigned address_bits) * and fixed mtrrs should take effect before var mtrr for it: */ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, - (1ULL<<(20 - PAGE_SHIFT)) - 1); + 1ULL<<(20 - PAGE_SHIFT)); /* Sort the ranges: */ sort_range(range, nr_range); @@ -939,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) nr_range = 0; if (mtrr_tom2) { range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); - range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; - if (highest_pfn < range[nr_range].end + 1) - highest_pfn = range[nr_range].end + 1; + range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT; + if (highest_pfn < range[nr_range].end) + highest_pfn = range[nr_range].end; nr_range++; } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); @@ -953,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { - if (range[i].end + 1 < range[i+1].start) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end, range[i+1].start); } /* Check the top: */ i = nr_range - 1; - if (range[i].end + 1 < end_pfn) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < end_pfn) + total_trim_size += real_trim_memory(range[i].end, end_pfn); if (total_trim_size) { -- cgit v1.2.2 From 79c601695870ca2a9c0ba9949a97d2be78ec07b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:14 -0800 Subject: x86: Print out RAM buffer information So we can check that early in the bootlog. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-11-git-send-email-yinghai@kernel.org> Reviewed-by: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a966b753e496..f406efeb4dc4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1429,6 +1429,8 @@ void __init e820_reserve_resources_late(void) end = MAX_RESOURCE_SIZE; if (start >= end) continue; + printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", + start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } -- cgit v1.2.2 From 1842f90cc98625d4d9bf8f8b927f17705ceb4e9c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:15 -0800 Subject: x86: Call early_res_to_bootmem one time Simplify setup_node_mem: don't use bootmem from other node, instead just find_e820_area in early_node_mem. This keeps the boundary between early_res and boot mem more clear, and lets us only call early_res_to_bootmem() one time instead of for all nodes. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-12-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3499b4fabc94..48cadbb1d28b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -967,6 +967,7 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); + early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); #ifdef CONFIG_X86_64 /* -- cgit v1.2.2 From 264ebb182e85f30aa473fa2189d5d5ea173ec3ab Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:16 -0800 Subject: x86: Introduce max_early_res and early_res_count To prepare allocate early res array from fine_e820_area. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-13-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f406efeb4dc4..7053f4adb8ed 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -732,14 +732,18 @@ core_initcall(e820_mark_nvs_memory); /* * Early reserved memory areas. */ -#define MAX_EARLY_RES 32 +/* + * need to make sure this one is bigger enough before + * find_e820_area could be used + */ +#define MAX_EARLY_RES_X 32 struct early_res { u64 start, end; - char name[16]; + char name[15]; char overlap_ok; }; -static struct early_res early_res[MAX_EARLY_RES] __initdata = { +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = { { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ #if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) /* @@ -753,12 +757,22 @@ static struct early_res early_res[MAX_EARLY_RES] __initdata = { {} }; +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata = +#ifdef CONFIG_X86_32 + 2 +#else + 1 +#endif + ; + static int __init find_overlapped_early(u64 start, u64 end) { int i; struct early_res *r; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { r = &early_res[i]; if (end > r->start && start < r->end) break; @@ -776,13 +790,14 @@ static void __init drop_range(int i) { int j; - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + for (j = i + 1; j < max_early_res && early_res[j].end; j++) ; memmove(&early_res[i], &early_res[i + 1], (j - 1 - i) * sizeof(struct early_res)); early_res[j - 1].end = 0; + early_res_count--; } /* @@ -801,9 +816,9 @@ static void __init drop_overlaps_that_are_ok(u64 start, u64 end) struct early_res *r; u64 lower_start, lower_end; u64 upper_start, upper_end; - char name[16]; + char name[15]; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { r = &early_res[i]; /* Continue past non-overlapping ranges */ @@ -859,7 +874,7 @@ static void __init __reserve_early(u64 start, u64 end, char *name, struct early_res *r; i = find_overlapped_early(start, end); - if (i >= MAX_EARLY_RES) + if (i >= max_early_res) panic("Too many early reservations"); r = &early_res[i]; if (r->end) @@ -872,6 +887,7 @@ static void __init __reserve_early(u64 start, u64 end, char *name, r->overlap_ok = overlap_ok; if (name) strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; } /* @@ -924,7 +940,7 @@ void __init free_early(u64 start, u64 end) i = find_overlapped_early(start, end); r = &early_res[i]; - if (i >= MAX_EARLY_RES || r->end != end || r->start != start) + if (i >= max_early_res || r->end != end || r->start != start) panic("free_early on not reserved area: %llx-%llx!", start, end - 1); @@ -935,14 +951,15 @@ void __init early_res_to_bootmem(u64 start, u64 end) { int i, count; u64 final_start, final_end; + int idx = 0; count = 0; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) + for (i = 0; i < max_early_res && early_res[i].end; i++) count++; - printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count, start, end); - for (i = 0; i < count; i++) { + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { struct early_res *r = &early_res[i]; printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, r->start, r->end, r->name); @@ -969,7 +986,7 @@ static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) again: i = find_overlapped_early(addr, addr + size); r = &early_res[i]; - if (i < MAX_EARLY_RES && r->end) { + if (i < max_early_res && r->end) { *addrp = addr = round_up(r->end, align); changed = 1; goto again; @@ -986,7 +1003,7 @@ static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) int changed = 0; again: last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { struct early_res *r = &early_res[i]; if (last > r->start && addr < r->start) { size = r->start - addr; -- cgit v1.2.2 From 28b1c57d3c1f8df69c958f2ae7b9e4b67538ff4d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:17 -0800 Subject: x86: Dynamically increase early_res array size Use early_res_count to track the num, and use find_e820 to get a new buffer, then copy from the old to the new one. Also, clear early_res to prevent later invalid usage. -v2 _check_and_double_early_res should take new start Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-14-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7053f4adb8ed..e09c18c8f3c1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -916,6 +916,48 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } +static void __init __check_and_double_early_res(u64 start) +{ + u64 end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? */ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + end = max_pfn_mapped << PAGE_SHIFT; + size = sizeof(struct early_res) * max_early_res * 2; + mem = find_e820_area(start, end, size, sizeof(struct early_res)); + + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + /* * Most early reservations come here. * @@ -929,6 +971,8 @@ void __init reserve_early(u64 start, u64 end, char *name) if (start >= end) return; + __check_and_double_early_res(end); + drop_overlaps_that_are_ok(start, end); __reserve_early(start, end, name, 0); } @@ -957,6 +1001,10 @@ void __init early_res_to_bootmem(u64 start, u64 end) for (i = 0; i < max_early_res && early_res[i].end; i++) count++; + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", count - idx, max_early_res, start, end); for (i = idx; i < count; i++) { @@ -974,6 +1022,11 @@ void __init early_res_to_bootmem(u64 start, u64 end) reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; } /* Check for already reserved areas */ -- cgit v1.2.2 From c252a5bb1f57afb1e336d68085217727ca7b2134 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:19 -0800 Subject: x86: Only call dma32_reserve_bootmem 64bit !CONFIG_NUMA 64bit NUMA already make enough space under 4G with new early_node_mem. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-16-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/pci-dma.c | 13 ++++++++++--- arch/x86/kernel/setup.c | 7 ------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 75e14e21f61a..1aa966c565f9 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -65,7 +65,7 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA) static __initdata void *dma32_bootmem_ptr; static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); @@ -116,14 +116,21 @@ static void __init dma32_free_bootmem(void) dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } +#else +void __init dma32_reserve_bootmem(void) +{ +} +static void __init dma32_free_bootmem(void) +{ +} + #endif void __init pci_iommu_alloc(void) { -#ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); -#endif + if (pci_swiotlb_detect()) goto out; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 48cadbb1d28b..ea4141b48518 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -969,14 +969,7 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn, acpi, k8); early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); -#ifdef CONFIG_X86_64 - /* - * dma32_reserve_bootmem() allocates bootmem which may conflict - * with the crashkernel command line, so do that after - * reserve_crashkernel() - */ dma32_reserve_bootmem(); -#endif reserve_ibft_region(); -- cgit v1.2.2 From 5b3efd500854d45d305b53c54c97db5970959980 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Thu, 11 Feb 2010 11:50:59 -0800 Subject: x86, ptrace: regset extensions to support xstate Add the xstate regset support which helps extend the kernel ptrace and the core-dump interfaces to support AVX state etc. This regset interface is designed to support all the future state that gets supported using xsave/xrstor infrastructure. Looking at the memory layout saved by "xsave", one can't say which state is represented in the memory layout. This is because if a particular state is in init state, in the xsave hdr it can be represented by bit '0'. And hence we can't really say by the xsave header wether a state is in init state or the state is not saved in the memory layout. And hence the xsave memory layout available through this regset interface uses SW usable bytes [464..511] to convey what state is represented in the memory layout. First 8 bytes of the sw_usable_bytes[464..467] will be set to OS enabled xstate mask(which is same as the 64bit mask returned by the xgetbv's xCR0). The note NT_X86_XSTATE represents the extended state information in the core file, using the above mentioned memory layout. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <20100211195614.802495327@sbs-t61.sc.intel.com> Signed-off-by: Hongjiu Lu <hjl.tools@gmail.com> Cc: Roland McGrath <roland@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/i387.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 34 ++++++++++++++++++-- arch/x86/kernel/xsave.c | 1 + 3 files changed, 116 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f2f8540a7f3d..7a8a193b5144 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -164,6 +164,11 @@ int init_fpu(struct task_struct *tsk) return 0; } +/* + * The xstateregs_active() routine is the same as the fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ int fpregs_active(struct task_struct *target, const struct user_regset *regset) { return tsk_used_math(target) ? regset->n : 0; @@ -224,6 +229,84 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, return ret; } +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + /* + * First copy the fxsave bytes 0..463. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, + offsetof(struct user_xstateregs, + i387.xstate_fx_sw)); + if (ret) + return ret; + + /* + * Copy the 48bytes defined by software. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + xstate_fx_sw_bytes, + offsetof(struct user_xstateregs, + i387.xstate_fx_sw), + offsetof(struct user_xstateregs, + xsave_hdr)); + if (ret) + return ret; + + /* + * Copy the rest of xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave.xsave_hdr, + offsetof(struct user_xstateregs, + xsave_hdr), -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct xsave_hdr_struct *xsave_hdr; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + + xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + return ret; +} + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..16433a59b396 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -48,6 +48,7 @@ enum x86_regset { REGSET_FP, REGSET_XFP, REGSET_IOPERM64 = REGSET_XFP, + REGSET_XSTATE, REGSET_TLS, REGSET_IOPERM32, }; @@ -1584,7 +1585,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static const struct user_regset x86_64_regsets[] = { +static struct user_regset x86_64_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1597,6 +1598,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, @@ -1622,7 +1629,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static const struct user_regset x86_32_regsets[] = { +static struct user_regset x86_32_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1641,6 +1648,12 @@ static const struct user_regset x86_32_regsets[] = { .size = sizeof(u32), .align = sizeof(u32), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, @@ -1663,6 +1676,23 @@ static const struct user_regset_view user_x86_32_view = { }; #endif +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. + */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index c5ee17e8c6d9..782c3a362ec6 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void) cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; + update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); setup_xstate_init(); -- cgit v1.2.2 From 08677214e318297f228237be0042aac754f48f1d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:20 -0800 Subject: x86: Make 64 bit use early_res instead of bootmem before slab Finally we can use early_res to replace bootmem for x86_64 now. Still can use CONFIG_NO_BOOTMEM to enable it or not. -v2: fix 32bit compiling about MAX_DMA32_PFN -v3: folded bug fix from LKML message below Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <4B747239.4070907@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 159 ++++++++++++++++++++++++++++++++++++++++++++---- arch/x86/kernel/setup.c | 2 + 2 files changed, 148 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e09c18c8f3c1..90a85295f332 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -977,6 +977,25 @@ void __init reserve_early(u64 start, u64 end, char *name) __reserve_early(start, end, name, 0); } +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + void __init free_early(u64 start, u64 end) { struct early_res *r; @@ -991,6 +1010,94 @@ void __init free_early(u64 start, u64 end) drop_range(i); } +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#if 1 + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if 0 + printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) { +#if 0 + printk(KERN_CONT "\n"); +#endif + continue; + } +#if 0 + printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n", + final_start, final_end); +#endif + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; +#ifdef MAX_DMA32_PFN + if (max_pfn_mapped > MAX_DMA32_PFN) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + end = max_pfn_mapped << PAGE_SHIFT; + mem = find_e820_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else void __init early_res_to_bootmem(u64 start, u64 end) { int i, count; @@ -1028,6 +1135,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) max_early_res = 0; early_res_count = 0; } +#endif /* Check for already reserved areas */ static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) @@ -1081,6 +1189,35 @@ again: return changed; } +/* + * Find a free area with specified alignment in a specific range. + * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + /* * Find a free area with specified alignment in a specific range. */ @@ -1090,24 +1227,20 @@ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + u64 addr; + u64 ei_start, ei_last; if (ei->type != E820_RAM) continue; - addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - continue; - if (last > end) + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr == -1ULL) continue; + return addr; } return -1ULL; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ea4141b48518..d49e168bda8c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -967,7 +967,9 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); +#ifndef CONFIG_NO_BOOTMEM early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); +#endif dma32_reserve_bootmem(); -- cgit v1.2.2 From db8f77c889542b09457b8b97efb311343c99a75d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:23 -0800 Subject: x86: Move bios page reserve early to head32/64.c So prepare to make one more clean of early_res.c. -v2: don't need to reserve first page in early_res because we already mark that in e820 as reserved already. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-20-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 22 ++-------------------- arch/x86/kernel/head32.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 90a85295f332..4004f10285d1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -743,29 +743,11 @@ struct early_res { char name[15]; char overlap_ok; }; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = { - { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, -#endif - - {} -}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; static int max_early_res __initdata = MAX_EARLY_RES_X; static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata = -#ifdef CONFIG_X86_32 - 2 -#else - 1 -#endif - ; +static int early_res_count __initdata; static int __init find_overlapped_early(u64 start, u64 end) { diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 5051b94c9069..adedeef1dedc 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,6 +29,16 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { +#ifdef CONFIG_X86_TRAMPOLINE + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, + "EX TRAMPOLINE"); +#endif + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD -- cgit v1.2.2 From a678c2be75773e112f6d656a22a7f1645c4dbd6c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:24 -0800 Subject: x86: Separate early_res related code from e820.c ... to make e820.c smaller. -v2: fix 32bit compiling with MAX_DMA32_PFN Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-21-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 541 +------------------------------------------- arch/x86/kernel/early_res.c | 538 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 540 insertions(+), 541 deletions(-) create mode 100644 arch/x86/kernel/early_res.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09bc5a52..f5fb9f0b6277 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -38,7 +38,7 @@ obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o -obj-y += bootflag.o e820.o +obj-y += bootflag.o e820.o early_res.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4004f10285d1..82db4015604e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -12,21 +12,14 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/string.h> -#include <linux/kexec.h> -#include <linux/module.h> -#include <linux/mm.h> #include <linux/pfn.h> #include <linux/suspend.h> #include <linux/firmware-map.h> -#include <asm/pgtable.h> -#include <asm/page.h> #include <asm/e820.h> +#include <asm/early_res.h> #include <asm/proto.h> #include <asm/setup.h> -#include <asm/trampoline.h> /* * The e820 map is the map that gets modified e.g. with command line parameters @@ -729,538 +722,6 @@ static int __init e820_mark_nvs_memory(void) core_initcall(e820_mark_nvs_memory); #endif -/* - * Early reserved memory areas. - */ -/* - * need to make sure this one is bigger enough before - * find_e820_area could be used - */ -#define MAX_EARLY_RES_X 32 - -struct early_res { - u64 start, end; - char name[15]; - char overlap_ok; -}; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; - -static int max_early_res __initdata = MAX_EARLY_RES_X; -static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < max_early_res && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; - early_res_count--; -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. - */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) -{ - int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[15]; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) - continue; - - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. */ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= max_early_res) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name?name:"", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -static void __init __check_and_double_early_res(u64 start) -{ - u64 end, size, mem; - struct early_res *new; - - /* do we have enough slots left ? */ - if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) - return; - - /* double it */ - end = max_pfn_mapped << PAGE_SHIFT; - size = sizeof(struct early_res) * max_early_res * 2; - mem = find_e820_area(start, end, size, sizeof(struct early_res)); - - if (mem == -1ULL) - panic("can not find more space for early_res array"); - - new = __va(mem); - /* save the first one for own */ - new[0].start = mem; - new[0].end = mem + size; - new[0].overlap_ok = 0; - /* copy old to new */ - if (early_res == early_res_x) { - memcpy(&new[1], &early_res[0], - sizeof(struct early_res) * max_early_res); - memset(&new[max_early_res+1], 0, - sizeof(struct early_res) * (max_early_res - 1)); - early_res_count++; - } else { - memcpy(&new[1], &early_res[1], - sizeof(struct early_res) * (max_early_res - 1)); - memset(&new[max_early_res], 0, - sizeof(struct early_res) * max_early_res); - } - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = new; - max_early_res *= 2; - printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", - max_early_res, mem, mem + size - 1); -} - -/* - * Most early reservations come here. - * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. - */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - __check_and_double_early_res(end); - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init reserve_early_without_check(u64 start, u64 end, char *name) -{ - struct early_res *r; - - if (start >= end) - return; - - __check_and_double_early_res(end); - - r = &early_res[early_res_count]; - - r->start = start; - r->end = end; - r->overlap_ok = 0; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= max_early_res || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -#ifdef CONFIG_NO_BOOTMEM -static void __init subtract_early_res(struct range *range, int az) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - -#if 1 - printk(KERN_INFO "Subtract (%d early reservations)\n", count); -#endif - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; -#if 0 - printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i, - r->start, r->end, r->name); -#endif - final_start = PFN_DOWN(r->start); - final_end = PFN_UP(r->end); - if (final_start >= final_end) { -#if 0 - printk(KERN_CONT "\n"); -#endif - continue; - } -#if 0 - printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n", - final_start, final_end); -#endif - subtract_range(range, az, final_start, final_end); - } - -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - int i, count; - u64 start = 0, end; - u64 size; - u64 mem; - struct range *range; - int nr_range; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - count *= 2; - - size = sizeof(struct range) * count; -#ifdef MAX_DMA32_PFN - if (max_pfn_mapped > MAX_DMA32_PFN) - start = MAX_DMA32_PFN << PAGE_SHIFT; -#endif - end = max_pfn_mapped << PAGE_SHIFT; - mem = find_e820_area(start, end, size, sizeof(struct range)); - if (mem == -1ULL) - panic("can not find more space for range free"); - - range = __va(mem); - /* use early_node_map[] and early_res to get range array at first */ - memset(range, 0, size); - nr_range = 0; - - /* need to go over early_node_map to find out good range for node */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); - subtract_early_res(range, count); - nr_range = clean_sort_range(range, count); - - /* need to clear it ? */ - if (nodeid == MAX_NUMNODES) { - memset(&early_res[0], 0, - sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - } - - *rangep = range; - return nr_range; -} -#else -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - - printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count - idx, max_early_res, start, end); - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } - /* clear them */ - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - early_res_count = 0; -} -#endif - -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < max_early_res && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) -{ - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < max_early_res && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - * only with the area.between start to end is active range from early_node_map - * so they are good as RAM - */ -u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - goto out; - if (last > end) - goto out; - - return addr; - -out: - return -1ULL; -} - -/* - * Find a free area with specified alignment in a specific range. - */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area(ei_start, ei_last, start, end, - size, align); - - if (addr == -1ULL) - continue; - - return addr; - } - return -1ULL; -} - -/* - * Find next free range after *start - */ -u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; - - if (ei->type != E820_RAM) - continue; - addr = round_up(ei->addr, align); - ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - continue; - return addr; - } - - return -1ULL; -} - /* * pre allocated 4k and reserved it in e820 */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c new file mode 100644 index 000000000000..1cf2c2f9ea68 --- /dev/null +++ b/arch/x86/kernel/early_res.c @@ -0,0 +1,538 @@ +/* + * early_res, could be used to replace bootmem + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mm.h> + +#include <asm/e820.h> +#include <asm/early_res.h> +#include <asm/proto.h> + +/* + * Early reserved memory areas. + */ +/* + * need to make sure this one is bigger enough before + * find_e820_area could be used + */ +#define MAX_EARLY_RES_X 32 + +struct early_res { + u64 start, end; + char name[15]; + char overlap_ok; +}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; + +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata; + +static int __init find_overlapped_early(u64 start, u64 end) +{ + int i; + struct early_res *r; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + break; + } + + return i; +} + +/* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and + * clearing what had been the last slot. + */ +static void __init drop_range(int i) +{ + int j; + + for (j = i + 1; j < max_early_res && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; + early_res_count--; +} + +/* + * Split any existing ranges that: + * 1) are marked 'overlap_ok', and + * 2) overlap with the stated range [start, end) + * into whatever portion (if any) of the existing range is entirely + * below or entirely above the stated range. Drop the portion + * of the existing range that overlaps with the stated range, + * which will allow the caller of this routine to then add that + * stated range without conflicting with any existing range. + */ +static void __init drop_overlaps_that_are_ok(u64 start, u64 end) +{ + int i; + struct early_res *r; + u64 lower_start, lower_end; + u64 upper_start, upper_end; + char name[15]; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + + /* Continue past non-overlapping ranges */ + if (end <= r->start || start >= r->end) + continue; + + /* + * Leave non-ok overlaps as is; let caller + * panic "Overlapping early reservations" + * when it hits this overlap. + */ + if (!r->overlap_ok) + return; + + /* + * We have an ok overlap. We will drop it from the early + * reservation map, and add back in any non-overlapping + * portions (lower or upper) as separate, overlap_ok, + * non-overlapping ranges. + */ + + /* 1. Note any non-overlapping (lower or upper) ranges. */ + strncpy(name, r->name, sizeof(name) - 1); + + lower_start = lower_end = 0; + upper_start = upper_end = 0; + if (r->start < start) { + lower_start = r->start; + lower_end = start; + } + if (r->end > end) { + upper_start = end; + upper_end = r->end; + } + + /* 2. Drop the original ok overlapping range */ + drop_range(i); + + i--; /* resume for-loop on copied down entry */ + + /* 3. Add back in any non-overlapping ranges. */ + if (lower_end) + reserve_early_overlap_ok(lower_start, lower_end, name); + if (upper_end) + reserve_early_overlap_ok(upper_start, upper_end, name); + } +} + +static void __init __reserve_early(u64 start, u64 end, char *name, + int overlap_ok) +{ + int i; + struct early_res *r; + + i = find_overlapped_early(start, end); + if (i >= max_early_res) + panic("Too many early reservations"); + r = &early_res[i]; + if (r->end) + panic("Overlapping early reservations " + "%llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name ? name : "", r->start, + r->end - 1, r->name); + r->start = start; + r->end = end; + r->overlap_ok = overlap_ok; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +/* + * A few early reservtations come here. + * + * The 'overlap_ok' in the name of this routine does -not- mean it + * is ok for these reservations to overlap an earlier reservation. + * Rather it means that it is ok for subsequent reservations to + * overlap this one. + * + * Use this entry point to reserve early ranges when you are doing + * so out of "Paranoia", reserving perhaps more memory than you need, + * just in case, and don't mind a subsequent overlapping reservation + * that is known to be needed. + * + * The drop_overlaps_that_are_ok() call here isn't really needed. + * It would be needed if we had two colliding 'overlap_ok' + * reservations, so that the second such would not panic on the + * overlap with the first. We don't have any such as of this + * writing, but might as well tolerate such if it happens in + * the future. + */ +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) +{ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 1); +} + +static void __init __check_and_double_early_res(u64 start) +{ + u64 end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? */ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + end = max_pfn_mapped << PAGE_SHIFT; + size = sizeof(struct early_res) * max_early_res * 2; + mem = find_e820_area(start, end, size, sizeof(struct early_res)); + + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + +/* + * Most early reservations come here. + * + * We first have drop_overlaps_that_are_ok() drop any pre-existing + * 'overlap_ok' ranges, so that we can then reserve this memory + * range without risk of panic'ing on an overlapping overlap_ok + * early reservation. + */ +void __init reserve_early(u64 start, u64 end, char *name) +{ + if (start >= end) + return; + + __check_and_double_early_res(end); + + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); +} + +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i; + + i = find_overlapped_early(start, end); + r = &early_res[i]; + if (i >= max_early_res || r->end != end || r->start != start) + panic("free_early on not reserved area: %llx-%llx!", + start, end - 1); + + drop_range(i); +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#define DEBUG_PRINT_EARLY_RES 1 + +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) + continue; + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; +#ifdef MAX_DMA32_PFN + if (max_pfn_mapped > MAX_DMA32_PFN) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + end = max_pfn_mapped << PAGE_SHIFT; + mem = find_e820_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, + r->start, r->end, r->name); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) { + printk(KERN_CONT "\n"); + continue; + } + printk(KERN_CONT " ==> [%010llx - %010llx]\n", + final_start, final_end); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); + } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; +} +#endif + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp; + int changed = 0; + struct early_res *r; +again: + i = find_overlapped_early(addr, addr + size); + r = &early_res[i]; + if (i < max_early_res && r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < max_early_res && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. + * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr == -1ULL) + continue; + + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && + addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + continue; + return addr; + } + + return -1ULL; +} -- cgit v1.2.2 From 7da657d1f1dd27fa9d8289d5f7e53479c7fd3a95 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:25 -0800 Subject: x86: Add find_early_area_size Prepare to move bck find_e820_area_size back to e820.c. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-22-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/early_res.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 1cf2c2f9ea68..bfa1ba705d48 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -476,6 +476,29 @@ out: return -1ULL; } +u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + goto out; + + return addr; + +out: + return -1ULL; +} + /* * Find a free area with specified alignment in a specific range. */ @@ -513,24 +536,20 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + u64 addr; + u64 ei_start, ei_last; if (ei->type != E820_RAM) continue; - addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) + ei_start = ei->addr; + addr = find_early_area_size(ei_start, ei_last, start, + sizep, align); + + if (addr == -1ULL) continue; + return addr; } -- cgit v1.2.2 From efdd0e81df0f23830c6d2cb971cf87f415b8dbdb Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:26 -0800 Subject: x86: Move back find_e820_area to e820.c Makes early_res.c more clean, so later could move it to /kernel. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-23-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 53 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/early_res.c | 57 --------------------------------------------- 2 files changed, 53 insertions(+), 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 82db4015604e..b4e512b03aa7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -722,6 +722,59 @@ static int __init e820_mark_nvs_memory(void) core_initcall(e820_mark_nvs_memory); #endif +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr != -1ULL) + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area_size(ei_start, ei_last, start, + sizep, align); + + if (addr != -1ULL) + return addr; + } + + return -1ULL; +} + /* * pre allocated 4k and reserved it in e820 */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index bfa1ba705d48..1b99a2619f9f 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -498,60 +498,3 @@ u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, out: return -1ULL; } - -/* - * Find a free area with specified alignment in a specific range. - */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area(ei_start, ei_last, start, end, - size, align); - - if (addr == -1ULL) - continue; - - return addr; - } - return -1ULL; -} - -/* - * Find next free range after *start - */ -u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area_size(ei_start, ei_last, start, - sizep, align); - - if (addr == -1ULL) - continue; - - return addr; - } - - return -1ULL; -} -- cgit v1.2.2 From 53db62a2529280ff216c941d8a2650204547e44a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:27 -0800 Subject: early_res: Enhance check_and_double_early_res ... to make it always try to start from low at first. This makes it less likely for early_memtest to reserve a bad range, in particular it puts new early_res in a range that is already tested. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-24-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/early_res.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 1b99a2619f9f..dbf08bd01252 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -180,9 +180,9 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } -static void __init __check_and_double_early_res(u64 start) +static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) { - u64 end, size, mem; + u64 start, end, size, mem; struct early_res *new; /* do we have enough slots left ? */ @@ -190,10 +190,23 @@ static void __init __check_and_double_early_res(u64 start) return; /* double it */ - end = max_pfn_mapped << PAGE_SHIFT; + mem = -1ULL; size = sizeof(struct early_res) * max_early_res * 2; - mem = find_e820_area(start, end, size, sizeof(struct early_res)); - + if (early_res == early_res_x) + start = 0; + else + start = early_res[0].end; + end = ex_start; + if (start + size < end) + mem = find_e820_area(start, end, size, + sizeof(struct early_res)); + if (mem == -1ULL) { + start = ex_end; + end = max_pfn_mapped << PAGE_SHIFT; + if (start + size < end) + mem = find_e820_area(start, end, size, + sizeof(struct early_res)); + } if (mem == -1ULL) panic("can not find more space for early_res array"); @@ -235,7 +248,7 @@ void __init reserve_early(u64 start, u64 end, char *name) if (start >= end) return; - __check_and_double_early_res(end); + __check_and_double_early_res(start, end); drop_overlaps_that_are_ok(start, end); __reserve_early(start, end, name, 0); @@ -248,7 +261,7 @@ void __init reserve_early_without_check(u64 start, u64 end, char *name) if (start >= end) return; - __check_and_double_early_res(end); + __check_and_double_early_res(start, end); r = &early_res[early_res_count]; -- cgit v1.2.2 From 59be5a8e8ce765cf739ec7f07176219972de7481 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:28 -0800 Subject: x86: Make 32bit support NO_BOOTMEM Let's make 32bit consistent with 64bit. -v2: Andrew pointed out for 32bit that we should use -1ULL Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-25-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/early_res.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index dbf08bd01252..656cdf86a2fa 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -354,6 +354,9 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) /* need to go over early_node_map to find out good range for node */ nr_range = add_from_early_node_map(range, count, nr_range, nodeid); +#ifdef CONFIG_X86_32 + subtract_range(range, count, max_low_pfn, -1ULL); +#endif subtract_early_res(range, count); nr_range = clean_sort_range(range, count); -- cgit v1.2.2 From dd645cee7b50b61cb2d05b59eb6027679c437af6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:30 -0800 Subject: x86: Add find_fw_memmap_area ... so we can move early_res up. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-27-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 4 ++++ arch/x86/kernel/early_res.c | 17 +++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b4e512b03aa7..36918d8463ab 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -748,6 +748,10 @@ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) return -1ULL; } +u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) +{ + return find_e820_area(start, end, size, align); +} /* * Find next free range after *start */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 656cdf86a2fa..1458dc022343 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -7,16 +7,14 @@ #include <linux/bootmem.h> #include <linux/mm.h> -#include <asm/e820.h> #include <asm/early_res.h> -#include <asm/proto.h> /* * Early reserved memory areas. */ /* * need to make sure this one is bigger enough before - * find_e820_area could be used + * find_fw_memmap_area could be used */ #define MAX_EARLY_RES_X 32 @@ -180,6 +178,13 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } +u64 __init __weak find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) +{ + panic("should have find_fw_memmap_area defined with arch"); + + return -1ULL; +} + static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) { u64 start, end, size, mem; @@ -198,13 +203,13 @@ static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) start = early_res[0].end; end = ex_start; if (start + size < end) - mem = find_e820_area(start, end, size, + mem = find_fw_memmap_area(start, end, size, sizeof(struct early_res)); if (mem == -1ULL) { start = ex_end; end = max_pfn_mapped << PAGE_SHIFT; if (start + size < end) - mem = find_e820_area(start, end, size, + mem = find_fw_memmap_area(start, end, size, sizeof(struct early_res)); } if (mem == -1ULL) @@ -343,7 +348,7 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) start = MAX_DMA32_PFN << PAGE_SHIFT; #endif end = max_pfn_mapped << PAGE_SHIFT; - mem = find_e820_area(start, end, size, sizeof(struct range)); + mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); if (mem == -1ULL) panic("can not find more space for range free"); -- cgit v1.2.2 From 414bb144efa2d2fe16d104d836d0d6b6e9265788 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Mon, 14 Dec 2009 13:08:41 +0100 Subject: x86, cpu: Print AMD virtualization features in /proc/cpuinfo This patch adds code to cpu initialization path to detect the extended virtualization features of AMD cpus to show them in /proc/cpuinfo. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> LKML-Reference: <1260792521-15212-1-git-send-email-joerg.roedel@amd.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/addon_cpuid_features.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 468489b57aae..97ad79cdf688 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, { 0, 0, 0, 0 } }; -- cgit v1.2.2 From 942fa3b63eb525aa0512ba28c42e656d8efc6787 Mon Sep 17 00:00:00 2001 From: Alan Cox <alan@linux.intel.com> Date: Mon, 8 Feb 2010 10:03:17 +0000 Subject: x86, mtrr: Kill over the top warn Fixes bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=12558 Fixes bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=12317 (and if this really needed to be a warn you'd be responding to the bugs left in bugzilla from it...) Signed-off-by: Alan Cox <alan@linux.intel.com> LKML-Reference: <20100208100239.2568.2940.stgit@localhost.localdomain> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 4d755846fee6..163e59e272d5 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -464,7 +464,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); mask_lo = tmp; } } -- cgit v1.2.2 From 97c169d39b6846a564dc8d883832e7fef9bdb77d Mon Sep 17 00:00:00 2001 From: Len Brown <len.brown@intel.com> Date: Tue, 16 Feb 2010 03:30:06 -0500 Subject: ACPI: remove Asus P2B-DS from acpi=ht blacklist We realized when we broke acpi=ht http://bugzilla.kernel.org/show_bug.cgi?id=14886 that acpi=ht is not needed on this box and folks have been using acpi=force on it anyway. Signed-off-by: Len Brown <len.brown@intel.com> --- arch/x86/kernel/acpi/boot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdfa5ca4..af1c5833ff23 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1342,14 +1342,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), }, }, - { - .callback = force_acpi_ht, - .ident = "ASUS P2B-DS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), - }, - }, { .callback = force_acpi_ht, .ident = "ASUS CUR-DLS", -- cgit v1.2.2 From dade7716925a4e9a31f249f9ca1ed4e2f1495a8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 25 Jul 2009 18:39:36 +0200 Subject: x86: Convert ioapic_lock and vector_lock to raw_spinlock Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/apic/io_apic.c | 106 ++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 937150e4c06d..d55e43d352b3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -73,8 +73,8 @@ */ int sis_apic_bug = -1; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -393,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) struct irq_pin_list *entry; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; @@ -402,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return false; } @@ -420,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return eu.entry; } @@ -446,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -461,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin) unsigned long flags; union entry_union eu = { .entry.mask = 1 }; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -591,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) BUG_ON(!cfg); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __mask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) @@ -601,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) struct irq_cfg *cfg = desc->chip_data; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void mask_IO_APIC_irq(unsigned int irq) @@ -1127,12 +1127,12 @@ void lock_vector_lock(void) /* Used to the online set of cpus does not change * during assign_irq_vector. */ - spin_lock(&vector_lock); + raw_spin_lock(&vector_lock); } void unlock_vector_lock(void) { - spin_unlock(&vector_lock); + raw_spin_unlock(&vector_lock); } static int @@ -1220,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int err; unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); err = __assign_irq_vector(irq, cfg, mask); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } @@ -1265,7 +1265,7 @@ void __setup_vector_irq(int cpu) * assignments that might be happening on another cpu in parallel, * while we setup our initial vector to irq mappings. */ - spin_lock(&vector_lock); + raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; @@ -1284,7 +1284,7 @@ void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } - spin_unlock(&vector_lock); + raw_spin_unlock(&vector_lock); } static struct irq_chip ioapic_chip; @@ -1603,14 +1603,14 @@ __apicdebuginit(void) print_IO_APIC(void) for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); if (reg_01.bits.version >= 0x20) reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); @@ -1905,9 +1905,9 @@ void __init enable_IO_APIC(void) * The number of IO-APIC IRQ registers (== #pins): */ for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } @@ -2047,9 +2047,9 @@ void __init setup_ioapic_ids_from_mpc(void) for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); old_id = mp_ioapics[apic_id].apicid; @@ -2108,16 +2108,16 @@ void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid); reg_00.bits.ID = mp_ioapics[apic_id].apicid; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) printk("could not set ID!\n"); else @@ -2200,7 +2200,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) unsigned long flags; struct irq_cfg *cfg; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) @@ -2208,7 +2208,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) } cfg = irq_cfg(irq); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } @@ -2219,9 +2219,9 @@ static int ioapic_retrigger_irq(unsigned int irq) struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; } @@ -2314,14 +2314,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); ret = set_desc_affinity(desc, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -2549,9 +2549,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ack_apic_level(unsigned int irq) @@ -3133,13 +3133,13 @@ static int ioapic_resume(struct sys_device *dev) data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { reg_00.bits.ID = mp_ioapics[dev->id].apicid; io_apic_write(dev->id, 0, reg_00.raw); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ioapic_write_entry(dev->id, i, entry[i]); @@ -3202,7 +3202,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (irq_want < nr_irqs_gsi) irq_want = nr_irqs_gsi; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { @@ -3221,7 +3221,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) irq = new; break; } - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); if (irq > 0) { dynamic_irq_init(irq); @@ -3261,9 +3261,9 @@ void destroy_irq(unsigned int irq) desc->chip_data = cfg; free_irte(irq); - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); } /* @@ -3800,9 +3800,9 @@ int __init io_apic_get_redir_entries (int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.entries; } @@ -3964,9 +3964,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (physids_empty(apic_id_map)) apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (apic_id >= get_physical_broadcast()) { printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " @@ -4000,10 +4000,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0, reg_00.raw); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* Sanity check */ if (reg_00.bits.ID != apic_id) { @@ -4024,9 +4024,9 @@ int __init io_apic_get_version(int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } -- cgit v1.2.2 From 1252f238db48ec419f40c1bdf30fda649860eed9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Tue, 16 Feb 2010 15:02:13 +0100 Subject: x86: set_personality_ia32() misses force_personality32 05d43ed8a "x86: get rid of the insane TIF_ABI_PENDING bit" forgot about force_personality32. Fix. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/process_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 41a26a82470a..126f0b493d04 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -527,6 +527,7 @@ void set_personality_ia32(void) /* Make sure to be in 32bit mode */ set_thread_flag(TIF_IA32); + current->personality |= force_personality32; /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; -- cgit v1.2.2 From 40d6753e78a602bdf62e7741c0caa36474882f00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 25 Jul 2009 18:33:11 +0200 Subject: x86: Convert set_atomicity_lock to raw_spinlock Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/cpu/mtrr/generic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 163e59e272d5..9aa5dc76ff4a 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -570,7 +570,7 @@ static unsigned long set_mtrr_state(void) static unsigned long cr4; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts, @@ -590,7 +590,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) * changes to the way the kernel boots */ - spin_lock(&set_atomicity_lock); + raw_spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; @@ -627,7 +627,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(cr4); - spin_unlock(&set_atomicity_lock); + raw_spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) -- cgit v1.2.2 From 0fdc7a8022c3eaff6b5ee27ffb9e913e5e58d8e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 25 Jul 2009 16:49:55 +0200 Subject: x86: Convert nmi_lock to raw_spinlock nmi_lock must be a spinning spinlock in -rt. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/apic/nmi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396cb..24e7742d633a 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -416,13 +416,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) /* We can be called before check_nmi_watchdog, hence NULL check. */ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - spin_lock(&lock); + raw_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); dump_stack(); - spin_unlock(&lock); + raw_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); rc = 1; -- cgit v1.2.2 From 5619c28061ff9d2559a93eaba492935530f2a513 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 25 Jul 2009 18:35:11 +0200 Subject: x86: Convert i8259_lock to raw_spinlock Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/i8259.c | 30 +++++++++++++++--------------- arch/x86/kernel/time.c | 4 ++-- arch/x86/kernel/visws_quirks.c | 6 +++--- 4 files changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c86591b906fa..f5e40339622b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1830,7 +1830,7 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "\nprinting PIC contents\n"); - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); v = inb(0xa1) << 8 | inb(0x21); printk(KERN_DEBUG "... PIC IMR: %04x\n", v); @@ -1844,7 +1844,7 @@ __apicdebuginit(void) print_PIC(void) outb(0x0a,0xa0); outb(0x0a,0x20); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); printk(KERN_DEBUG "... PIC ISR: %04x\n", v); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102bef80..8c93a84bb627 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,7 +32,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); struct irq_chip i8259A_chip = { @@ -68,13 +68,13 @@ void disable_8259A_irq(unsigned int irq) unsigned int mask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void enable_8259A_irq(unsigned int irq) @@ -82,13 +82,13 @@ void enable_8259A_irq(unsigned int irq) unsigned int mask = ~(1 << irq); unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } int i8259A_irq_pending(unsigned int irq) @@ -97,12 +97,12 @@ int i8259A_irq_pending(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) ret = inb(PIC_MASTER_CMD) & mask; else ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } @@ -150,7 +150,7 @@ static void mask_and_ack_8259A(unsigned int irq) unsigned int irqmask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign @@ -183,7 +183,7 @@ handle_real_irq: outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return; spurious_8259A_irq: @@ -285,24 +285,24 @@ void mask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void unmask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void init_8259A(int auto_eoi) @@ -311,7 +311,7 @@ void init_8259A(int auto_eoi) i8259A_auto_eoi = auto_eoi; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ @@ -356,5 +356,5 @@ void init_8259A(int auto_eoi) outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index be2573448ed9..fb5cc5e14cfa 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + raw_spin_lock(&i8259A_lock); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + raw_spin_unlock(&i8259A_lock); } global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 34a279a7471d..ab38ce0984fa 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -559,7 +559,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) struct irq_desc *desc; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* Find out what's interrupting in the PIIX4 master 8259 */ outb(0x0c, 0x20); /* OCW3 Poll command */ @@ -596,7 +596,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) outb(0x60 + realirq, 0x20); } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); desc = irq_to_desc(realirq); @@ -614,7 +614,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) return IRQ_HANDLED; out_unlock: - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return IRQ_NONE; } -- cgit v1.2.2 From c13f3d378f77ce3176628ade452b0e461242faf3 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Date: Mon, 15 Feb 2010 11:33:04 +0900 Subject: x86/gart: Unexport gart_iommu_aperture I wrongly exported gart_iommu_aperture in the commit 42590a75019a50012f25a962246498dead428433. It's not necessary so let's unexport it. Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Cc: Joerg Roedel <joerg.roedel@amd.com> LKML-Reference: <20100215113241P.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/aperture_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index f147a95fd84a..3704997e8b25 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -31,7 +31,6 @@ #include <asm/x86_init.h> int gart_iommu_aperture; -EXPORT_SYMBOL_GPL(gart_iommu_aperture); int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; -- cgit v1.2.2 From 580e0ad21d6d6f932461d24b47041e3dd499c23f Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Tue, 16 Feb 2010 18:40:35 -0800 Subject: core: Move early_res from arch/x86 to kernel/ This makes the range reservation feature available to other architectures. -v2: add get_max_mapped, max_pfn_mapped only defined in x86... to fix PPC compiling -v3: according to hpa, add CONFIG_HAVE_EARLY_RES -v4: fix typo about EARLY_RES in config Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <4B7B5723.4070009@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 10 +- arch/x86/kernel/early_res.c | 521 -------------------------------------------- 3 files changed, 10 insertions(+), 523 deletions(-) delete mode 100644 arch/x86/kernel/early_res.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f5fb9f0b6277..d87f09bc5a52 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -38,7 +38,7 @@ obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o -obj-y += bootflag.o e820.o early_res.o +obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 36918d8463ab..740b440fbd73 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -17,7 +17,6 @@ #include <linux/firmware-map.h> #include <asm/e820.h> -#include <asm/early_res.h> #include <asm/proto.h> #include <asm/setup.h> @@ -752,6 +751,15 @@ u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) { return find_e820_area(start, end, size, align); } + +u64 __init get_max_mapped(void) +{ + u64 end = max_pfn_mapped; + + end <<= PAGE_SHIFT; + + return end; +} /* * Find next free range after *start */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c deleted file mode 100644 index 1458dc022343..000000000000 --- a/arch/x86/kernel/early_res.c +++ /dev/null @@ -1,521 +0,0 @@ -/* - * early_res, could be used to replace bootmem - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/mm.h> - -#include <asm/early_res.h> - -/* - * Early reserved memory areas. - */ -/* - * need to make sure this one is bigger enough before - * find_fw_memmap_area could be used - */ -#define MAX_EARLY_RES_X 32 - -struct early_res { - u64 start, end; - char name[15]; - char overlap_ok; -}; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; - -static int max_early_res __initdata = MAX_EARLY_RES_X; -static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < max_early_res && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; - early_res_count--; -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. - */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) -{ - int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[15]; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) - continue; - - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. */ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= max_early_res) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name ? name : "", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -u64 __init __weak find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) -{ - panic("should have find_fw_memmap_area defined with arch"); - - return -1ULL; -} - -static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) -{ - u64 start, end, size, mem; - struct early_res *new; - - /* do we have enough slots left ? */ - if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) - return; - - /* double it */ - mem = -1ULL; - size = sizeof(struct early_res) * max_early_res * 2; - if (early_res == early_res_x) - start = 0; - else - start = early_res[0].end; - end = ex_start; - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - if (mem == -1ULL) { - start = ex_end; - end = max_pfn_mapped << PAGE_SHIFT; - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - } - if (mem == -1ULL) - panic("can not find more space for early_res array"); - - new = __va(mem); - /* save the first one for own */ - new[0].start = mem; - new[0].end = mem + size; - new[0].overlap_ok = 0; - /* copy old to new */ - if (early_res == early_res_x) { - memcpy(&new[1], &early_res[0], - sizeof(struct early_res) * max_early_res); - memset(&new[max_early_res+1], 0, - sizeof(struct early_res) * (max_early_res - 1)); - early_res_count++; - } else { - memcpy(&new[1], &early_res[1], - sizeof(struct early_res) * (max_early_res - 1)); - memset(&new[max_early_res], 0, - sizeof(struct early_res) * max_early_res); - } - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = new; - max_early_res *= 2; - printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", - max_early_res, mem, mem + size - 1); -} - -/* - * Most early reservations come here. - * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. - */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init reserve_early_without_check(u64 start, u64 end, char *name) -{ - struct early_res *r; - - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - r = &early_res[early_res_count]; - - r->start = start; - r->end = end; - r->overlap_ok = 0; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= max_early_res || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -#ifdef CONFIG_NO_BOOTMEM -static void __init subtract_early_res(struct range *range, int az) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - -#define DEBUG_PRINT_EARLY_RES 1 - -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO "Subtract (%d early reservations)\n", count); -#endif - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, - r->start, r->end, r->name); -#endif - final_start = PFN_DOWN(r->start); - final_end = PFN_UP(r->end); - if (final_start >= final_end) - continue; - subtract_range(range, az, final_start, final_end); - } - -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - int i, count; - u64 start = 0, end; - u64 size; - u64 mem; - struct range *range; - int nr_range; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - count *= 2; - - size = sizeof(struct range) * count; -#ifdef MAX_DMA32_PFN - if (max_pfn_mapped > MAX_DMA32_PFN) - start = MAX_DMA32_PFN << PAGE_SHIFT; -#endif - end = max_pfn_mapped << PAGE_SHIFT; - mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); - if (mem == -1ULL) - panic("can not find more space for range free"); - - range = __va(mem); - /* use early_node_map[] and early_res to get range array at first */ - memset(range, 0, size); - nr_range = 0; - - /* need to go over early_node_map to find out good range for node */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); -#ifdef CONFIG_X86_32 - subtract_range(range, count, max_low_pfn, -1ULL); -#endif - subtract_early_res(range, count); - nr_range = clean_sort_range(range, count); - - /* need to clear it ? */ - if (nodeid == MAX_NUMNODES) { - memset(&early_res[0], 0, - sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - } - - *rangep = range; - return nr_range; -} -#else -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - - printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count - idx, max_early_res, start, end); - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } - /* clear them */ - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - early_res_count = 0; -} -#endif - -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < max_early_res && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) -{ - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < max_early_res && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - * only with the area.between start to end is active range from early_node_map - * so they are good as RAM - */ -u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - goto out; - if (last > end) - goto out; - - return addr; - -out: - return -1ULL; -} - -u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, - u64 *sizep, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - goto out; - - return addr; - -out: - return -1ULL; -} -- cgit v1.2.2 From 0a832320f1bae6a4169bf683e201378f2437cfc1 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" <justinmattock@gmail.com> Date: Tue, 16 Feb 2010 15:17:29 -0800 Subject: x86: Add iMac9,1 to pci_reboot_dmi_table On the iMac9,1 /sbin/reboot results in a black mangled screen. Adding this DMI entry gets the machine to reboot cleanly as it should. Signed-off-by: Justin P. Mattock <justinmattock@gmail.com> LKML-Reference: <1266362249-3337-1-git-send-email-justinmattock@gmail.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> Cc: <stable@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 704bddcdf64d..8e1aac86b50c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -461,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), }, }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, { } }; -- cgit v1.2.2 From e7b8e675d9c71b868b66f62f725a948047514719 Mon Sep 17 00:00:00 2001 From: Mike Frysinger <vapier@gentoo.org> Date: Tue, 26 Jan 2010 04:40:03 -0500 Subject: tracing: Unify arch_syscall_addr() implementations Most implementations of arch_syscall_addr() are the same, so create a default version in common code and move the one piece that differs (the syscall table) to asm/syscall.h. New arch ports don't have to waste time copying & pasting this simple function. The s390/sparc versions need to be different, so document why. Signed-off-by: Mike Frysinger <vapier@gentoo.org> Acked-by: David S. Miller <davem@davemloft.net> Acked-by: Paul Mundt <lethal@linux-sh.org> Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Steven Rostedt <rostedt@goodmis.org> LKML-Reference: <1264498803-17278-1-git-send-email-vapier@gentoo.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> --- arch/x86/kernel/ftrace.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 309689245431..0d93a941934c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -484,13 +484,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned long *sys_call_table; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)(&sys_call_table)[nr]; -} -#endif -- cgit v1.2.2 From 6738762d73a237ec322b04d8b9d55c8fd5d84713 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:36 -0800 Subject: x86, irq: Remove arch_probe_nr_irqs So keep nr_irqs == NR_IRQS. With radix trees is matters less. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-33-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f5e40339622b..c64ddd9d9979 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3826,28 +3826,6 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } -#ifdef CONFIG_SPARSE_IRQ -int __init arch_probe_nr_irqs(void) -{ - int nr; - - if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) - nr_irqs = NR_VECTORS * nr_cpu_ids; - - nr = nr_irqs_gsi + 8 * nr_cpu_ids; -#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) - /* - * for MSI and HT dyn irq - */ - nr += nr_irqs_gsi * 16; -#endif - if (nr < nr_irqs) - nr_irqs = nr; - - return 0; -} -#endif - static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { -- cgit v1.2.2 From 2b633e3fac5efada088b57d31e65401f22bcc18f Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 10 Feb 2010 01:20:37 -0800 Subject: smp: Use nr_cpus= to set nr_cpu_ids early On x86, before prefill_possible_map(), nr_cpu_ids will be NR_CPUS aka CONFIG_NR_CPUS. Add nr_cpus= to set nr_cpu_ids. so we can simulate cpus <=8 are installed on normal config. -v2: accordging to Christoph, acpi_numa_init should use nr_cpu_ids in stead of NR_CPUS. -v3: add doc in kernel-parameters.txt according to Andrew. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-34-git-send-email-yinghai@kernel.org> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> Cc: Tony Luck <tony.luck@intel.com> --- arch/x86/kernel/smpboot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..eff2fe175422 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1213,11 +1213,12 @@ __init void prefill_possible_map(void) total_cpus = max_t(int, possible, num_processors + disabled_cpus); - if (possible > CONFIG_NR_CPUS) { + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { printk(KERN_WARNING "%d Processors exceeds NR_CPUS limit of %d\n", - possible, CONFIG_NR_CPUS); - possible = CONFIG_NR_CPUS; + possible, nr_cpu_ids); + possible = nr_cpu_ids; } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", -- cgit v1.2.2 From eb5b3794062824ba12d883901eea49ea89d0a678 Mon Sep 17 00:00:00 2001 From: Brandon Philips <bphilips@suse.de> Date: Sun, 7 Feb 2010 13:02:50 -0800 Subject: x86, irq: Keep chip_data in create_irq_nr and destroy_irq Version 4: use get_irq_chip_data() in destroy_irq() to get rid of some local vars. When two drivers are setting up MSI-X at the same time via pci_enable_msix() there is a race. See this dmesg excerpt: [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X [ 85.170611] alloc irq_desc for 99 on node -1 [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X [ 85.170614] alloc kstat_irqs on node -1 [ 85.170616] alloc irq_2_iommu on node -1 [ 85.170617] alloc irq_desc for 100 on node -1 [ 85.170619] alloc kstat_irqs on node -1 [ 85.170621] alloc irq_2_iommu on node -1 [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X [ 85.170626] alloc irq_desc for 101 on node -1 [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X [ 85.170630] alloc kstat_irqs on node -1 [ 85.170631] alloc irq_2_iommu on node -1 [ 85.170635] alloc irq_desc for 102 on node -1 [ 85.170636] alloc kstat_irqs on node -1 [ 85.170639] alloc irq_2_iommu on node -1 [ 85.170646] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 As you can see igb and ixgbe are both alternating on create_irq_nr() via pci_enable_msix() in their probe function. ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data = NULL via dynamic_irq_init(). igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[] via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this: cfg_new = irq_desc_ptrs[102]->chip_data; if (cfg_new->vector != 0) continue; This hits the NULL deref. Another possible race exists via pci_disable_msix() in a driver or in the number of error paths that call free_msi_irqs(): destroy_irq() dynamic_irq_cleanup() which sets desc->chip_data = NULL ...race window... desc->chip_data = cfg; Remove the save and restore code for cfg in create_irq_nr() and destroy_irq() and take the desc->lock when checking the irq_cfg. Reported-and-analyzed-by: Brandon Philips <bphilips@suse.de> Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <20100207210250.GB8256@jenkins.home.ifup.org> Signed-off-by: Brandon Phiilps <bphilips@suse.de> Cc: stable@kernel.org Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5e4cce254e43..e93a76bc8670 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3278,12 +3278,9 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) { - dynamic_irq_init(irq); - /* restore it, in case dynamic_irq_init clear it */ - if (desc_new) - desc_new->chip_data = cfg_new; - } + if (irq > 0) + dynamic_irq_init_keep_chip_data(irq); + return irq; } @@ -3305,19 +3302,12 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; - struct irq_cfg *cfg; - struct irq_desc *desc; - /* store it, in case dynamic_irq_cleanup clear it */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - dynamic_irq_cleanup(irq); - /* connect back irq_cfg */ - desc->chip_data = cfg; + dynamic_irq_cleanup_keep_chip_data(irq); free_irte(irq); spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, cfg); + __clear_irq_vector(irq, get_irq_chip_data(irq)); spin_unlock_irqrestore(&vector_lock, flags); } -- cgit v1.2.2 From f619b3d8427eb57f0134dab75b0d217325c72411 Mon Sep 17 00:00:00 2001 From: Borislav Petkov <borislav.petkov@amd.com> Date: Thu, 4 Feb 2010 12:09:07 +0100 Subject: x86, cacheinfo: Remove NUMA dependency, fix for AMD Fam10h rev D1 The show/store_cache_disable routines depend unnecessarily on NUMA's cpu_to_node and the disabling of cache indices broke when !CONFIG_NUMA. Remove that dependency by using a helper which is always correct. While at it, enable L3 Cache Index disable on rev D1 Istanbuls which sport the feature too. Signed-off-by: Borislav Petkov <borislav.petkov@amd.com> LKML-Reference: <20100218184339.GG20473@aftab> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 589b705e80ed..be5f5c28ddfb 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,7 +327,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) /* see errata #382 and #388 */ if ((boot_cpu_data.x86 == 0x10) && - ((boot_cpu_data.x86_model < 0x9) || + ((boot_cpu_data.x86_model < 0x8) || (boot_cpu_data.x86_mask < 0x1))) return; @@ -744,7 +744,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, unsigned int index) { int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); + int node = amd_get_nb_id(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned int reg = 0; @@ -771,7 +771,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, size_t count, unsigned int index) { int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); + int node = amd_get_nb_id(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; -- cgit v1.2.2 From cb19060abfdecac0d1eb2d2f0e7d6b7a3f8bc4f4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov <borislav.petkov@amd.com> Date: Thu, 18 Feb 2010 19:37:14 +0100 Subject: x86, cacheinfo: Enable L3 CID only on AMD Final stage linking can fail with arch/x86/built-in.o: In function `store_cache_disable': intel_cacheinfo.c:(.text+0xc509): undefined reference to `amd_get_nb_id' arch/x86/built-in.o: In function `show_cache_disable': intel_cacheinfo.c:(.text+0xc7d3): undefined reference to `amd_get_nb_id' when CONFIG_CPU_SUP_AMD is not enabled because the amd_get_nb_id helper is defined in AMD-specific code but also used in generic code (intel_cacheinfo.c). Reorganize the L3 cache index disable code under CONFIG_CPU_SUP_AMD since it is AMD-only anyway. Signed-off-by: Borislav Petkov <borislav.petkov@amd.com> LKML-Reference: <20100218184210.GF20473@aftab> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/intel_cacheinfo.c | 186 ++++++++++++++++++---------------- 1 file changed, 98 insertions(+), 88 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index be5f5c28ddfb..d440123c556f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -293,6 +293,13 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); +}; + +#ifdef CONFIG_CPU_SUP_AMD static unsigned int __cpuinit amd_calc_l3_indices(void) { /* @@ -303,7 +310,7 @@ static unsigned int __cpuinit amd_calc_l3_indices(void) int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned int sc0, sc1, sc2, sc3; - u32 val; + u32 val = 0; pci_read_config_dword(dev, 0x1C4, &val); @@ -335,6 +342,94 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) this_leaf->l3_indices = amd_calc_l3_indices(); } +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!dev) + return -EINVAL; + + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "0x%08x\n", reg); +} + +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) + return -EINVAL; + + if (strict_strtoul(buf, 10, &val) < 0) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) + return -EINVAL; + + val |= BIT(30); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. + */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); + return count; +} + +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); + +#else /* CONFIG_CPU_SUP_AMD */ +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +{ +}; +#endif /* CONFIG_CPU_SUP_AMD */ + static int __cpuinit cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) @@ -740,88 +835,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, - unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = amd_get_nb_id(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned int reg = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!dev) - return -EINVAL; - - pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "0x%08x\n", reg); -} - -#define SHOW_CACHE_DISABLE(index) \ -static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ -{ \ - return show_cache_disable(this_leaf, buf, index); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = amd_get_nb_id(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned long val = 0; - -#define SUBCACHE_MASK (3UL << 20) -#define SUBCACHE_INDEX 0xfff - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; - - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) - return -EINVAL; - - val |= BIT(30); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - /* - * We need to WBINVD on a core on the node containing the L3 cache which - * indices we disable therefore a simple wbinvd() is not sufficient. - */ - wbinvd_on_cpu(cpu); - pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); - return count; -} - -#define STORE_CACHE_DISABLE(index) \ -static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ -{ \ - return store_cache_disable(this_leaf, buf, count, index); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); -}; - #define define_one_ro(_name) \ static struct _cache_attr _name = \ __ATTR(_name, 0444, show_##_name, NULL) @@ -836,11 +849,6 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); - #define DEFAULT_SYSFS_CACHE_ATTRS \ &type.attr, \ &level.attr, \ @@ -859,8 +867,10 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, +#ifdef CONFIG_CPU_SUP_AMD &cache_disable_0.attr, &cache_disable_1.attr, +#endif NULL }; -- cgit v1.2.2 From 84d710926797a6e317e7e94654a3ccd771cfd8a3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Thu, 18 Feb 2010 16:00:59 +0100 Subject: hw-breakpoints: Accept breakpoints on NULL address Before we had a generic breakpoint API, ptrace was accepting breakpoints on NULL address in x86. The new API refuse them, without given strong reasons. We need to follow the previous behaviour as some userspace apps like Wine need such NULL breakpoints to ensure old emulated software protections are still working. This fixes a 2.6.32 - 2.6.33-x ptrace regression. Reported-and-tested-by: Michael Stefaniuc <mstefani@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Acked-by: K.Prasad <prasad@linux.vnet.ibm.com> Acked-by: Roland McGrath <roland@redhat.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: Maneesh Soni <maneesh@linux.vnet.ibm.com> Cc: Alexandre Julliard <julliard@winehq.org> Cc: Rafael J. Wysocki <rjw@sisk.pl> Cc: Maciej Rutecki <maciej.rutecki@gmail.com> --- arch/x86/kernel/hw_breakpoint.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 05d5fec64a94..bb6006e3e295 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } -/* - * Store a breakpoint's encoded address, length, and type. - */ -static int arch_store_info(struct perf_event *bp) -{ - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - /* - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); - if (info->address) - return 0; - - return -EINVAL; -} - int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { @@ -362,10 +343,13 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - ret = arch_store_info(bp); - - if (ret < 0) - return ret; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. -- cgit v1.2.2 From 326264a02448b0ac51f78f178b78e830aa077a0b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Thu, 18 Feb 2010 18:24:18 +0100 Subject: hw-breakpoint: Keep track of dr7 local enable bits When the user enables breakpoints through dr7, he can choose between "local" or "global" enable bits but given how linux is implemented, both have the same effect. That said we don't keep track how the user enabled the breakpoints so when the user requests the dr7 value, we only translate the "enabled" status using the global enabled bits. It means that if the user enabled a breakpoint using the local enabled bit, reading back dr7 will set the global bit and clear the local one. Apps like Wine expect a full dr7 POKEUSER/PEEKUSER match for emulated softwares that implement old reverse engineering protection schemes. We fix that by keeping track of the whole dr7 value given by the user in the thread structure to drop this bug. We'll think about something more proper later. This fixes a 2.6.32 - 2.6.33-x ptrace regression. Reported-and-tested-by: Michael Stefaniuc <mstefani@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Acked-by: K.Prasad <prasad@linux.vnet.ibm.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: Maneesh Soni <maneesh@linux.vnet.ibm.com> Cc: Alexandre Julliard <julliard@winehq.org> Cc: Rafael J. Wysocki <rjw@sisk.pl> Cc: Maciej Rutecki <maciej.rutecki@gmail.com> --- arch/x86/kernel/ptrace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..0c1033d61e59 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -702,7 +702,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { - val = ptrace_get_dr7(thread->ptrace_bps); + val = thread->ptrace_dr7; } return val; } @@ -778,8 +778,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) return rc; } /* All that's left is DR7 */ - if (n == 7) + if (n == 7) { rc = ptrace_write_dr7(tsk, val); + if (!rc) + thread->ptrace_dr7 = val; + } ret_path: return rc; -- cgit v1.2.2 From b72d0db9dd41da1f2ec6274b03e8909583c64e41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 29 Aug 2009 16:24:51 +0200 Subject: x86: Move pci init function to x86_init The PCI initialization in pci_subsys_init() is a mess. pci_numaq_init, pci_acpi_init, pci_visws_init and pci_legacy_init are called and each implementation checks and eventually modifies the global variable pcibios_scanned. x86_init functions allow us to do this more elegant. The pci.init function pointer is preset to pci_legacy_init. numaq, acpi and visws can modify the pointer in their early setup functions. The functions return 0 when they did the full initialization including bus scan. A non zero return value indicates that pci_legacy_init needs to be called either because the selected function failed or wants the generic bus scan in pci_legacy_init to happen (e.g. visws). Signed-off-by: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80CFE@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org> Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/acpi/boot.c | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 1 + arch/x86/kernel/visws_quirks.c | 6 +----- arch/x86/kernel/x86_init.c | 5 +++++ 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdfa5ca4..054a5f5548b0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include <linux/ioport.h> #include <linux/pci.h> +#include <asm/pci_x86.h> #include <asm/pgtable.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -1604,6 +1605,9 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (!acpi_noirq) + x86_init.pci.init = pci_acpi_init; + return 0; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 98c4665f251c..be5c4fd47778 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -277,6 +277,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; x86_init.timers.tsc_pre_init = numaq_tsc_init; + x86_init.pci.init = pci_numaq_init; } } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 34a279a7471d..843e9d30a1e3 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -49,11 +49,6 @@ extern int no_broadcast; char visws_board_type = -1; char visws_board_rev = -1; -int is_visws_box(void) -{ - return visws_board_type >= 0; -} - static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -242,6 +237,7 @@ void __init visws_early_detect(void) x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; x86_init.timers.timer_init = visws_time_init; + x86_init.pci.init = pci_visws_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ccd179dec36e..81faa6d67d69 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -7,6 +7,7 @@ #include <asm/bios_ebda.h> #include <asm/paravirt.h> +#include <asm/pci_x86.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> @@ -70,6 +71,10 @@ struct x86_init_ops x86_init __initdata = { .iommu = { .iommu_init = iommu_init_noop, }, + + .pci = { + .init = x86_default_pci_init, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { -- cgit v1.2.2 From ab3b37937e8f4fb38dc9780b7bc3fd3c5195cca3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 29 Aug 2009 17:47:33 +0200 Subject: x86: Add pci_init_irq to x86_init Moorestown wants to reuse pcibios_init_irq but needs to provide its own implementation of pci_enable_irq. After we distangled the init we can move the init_irq call to x86_init and remove the pci_enable_irq != NULL check in pcibios_init_irq. pci_enable_irq is compile time initialized to pirq_enable_irq and the special cases which override it (visws and acpi) set the x86_init function pointer to noop. That allows MSRT to override pci_enable_irq and otherwise run pcibios_init_irq unmodified. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80CFF@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org> Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/visws_quirks.c | 1 + arch/x86/kernel/x86_init.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 843e9d30a1e3..b48ef6c0d716 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -238,6 +238,7 @@ void __init visws_early_detect(void) x86_init.irqs.trap_init = visws_trap_init; x86_init.timers.timer_init = visws_time_init; x86_init.pci.init = pci_visws_init; + x86_init.pci.init_irq = x86_init_noop; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 81faa6d67d69..203f26fb7f33 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -74,6 +74,7 @@ struct x86_init_ops x86_init __initdata = { .pci = { .init = x86_default_pci_init, + .init_irq = x86_default_pci_init_irq, }, }; -- cgit v1.2.2 From 9325a28ce2fa7c597e5ed41455a06c30b82b5710 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 29 Aug 2009 17:51:26 +0200 Subject: x86: Add pcibios_fixup_irqs to x86_init Platforms like Moorestown want to override the pcibios_fixup_irqs default function. Add it to x86_init.pci. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D00@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org> Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/x86_init.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 203f26fb7f33..1817cd7a03fa 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -4,6 +4,7 @@ * For licencing details see kernel-base/COPYING */ #include <linux/init.h> +#include <linux/ioport.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> @@ -75,6 +76,7 @@ struct x86_init_ops x86_init __initdata = { .pci = { .init = x86_default_pci_init, .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, }, }; -- cgit v1.2.2 From d39f6495f66616b637260405d0b5dc2656bc490e Mon Sep 17 00:00:00 2001 From: Alek Du <alek.du@intel.com> Date: Mon, 7 Sep 2009 16:25:45 +0800 Subject: x86, ioapic: Improve handling of i8259A irq init Since we already track the number of legacy vectors by nr_legacy_irqs, we can avoid use static vector allocations -- we can use dynamic one. Signed-off-by: Alek Du <alek.du@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D01@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816d..75265ab83b17 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -140,27 +140,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg irq_cfgx[] = { +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; #else -static struct irq_cfg irq_cfgx[NR_IRQS] = { +static struct irq_cfg irq_cfgx[NR_IRQS]; #endif - [0] = { .vector = IRQ0_VECTOR, }, - [1] = { .vector = IRQ1_VECTOR, }, - [2] = { .vector = IRQ2_VECTOR, }, - [3] = { .vector = IRQ3_VECTOR, }, - [4] = { .vector = IRQ4_VECTOR, }, - [5] = { .vector = IRQ5_VECTOR, }, - [6] = { .vector = IRQ6_VECTOR, }, - [7] = { .vector = IRQ7_VECTOR, }, - [8] = { .vector = IRQ8_VECTOR, }, - [9] = { .vector = IRQ9_VECTOR, }, - [10] = { .vector = IRQ10_VECTOR, }, - [11] = { .vector = IRQ11_VECTOR, }, - [12] = { .vector = IRQ12_VECTOR, }, - [13] = { .vector = IRQ13_VECTOR, }, - [14] = { .vector = IRQ14_VECTOR, }, - [15] = { .vector = IRQ15_VECTOR, }, -}; void __init io_apic_disable_legacy(void) { @@ -181,6 +164,8 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { + if (i < nr_legacy_irqs) + cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); -- cgit v1.2.2 From 35f720c5930f689647d51ad77e2a8d6f0abf66c8 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Thu, 17 Sep 2009 07:36:43 -0700 Subject: x86: Initialize stack canary in secondary start Some secondary clockevent setup code needs to call request_irq, which will cause fake stack check failure in schedule() if voluntary preemption model is chosen. It is safe to have stack canary initialized here early, since start_secondary() does not return. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D02@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/smpboot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b4e870cbdc60..3e6150d421e4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -48,6 +48,7 @@ #include <linux/err.h> #include <linux/nmi.h> #include <linux/tboot.h> +#include <linux/stackprotector.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -324,6 +325,9 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); + /* to prevent fake stack check failure in clock setup */ + boot_init_stack_canary(); + x86_cpuinit.setup_percpu_clockev(); wmb(); -- cgit v1.2.2 From ef3548668c02cc8c3922f4423f32b53e662811c6 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Mon, 9 Nov 2009 11:24:14 -0800 Subject: x86, pic: Introduce legacy_pic abstraction This patch makes i8259A like legacy programmable interrupt controller code into a driver so that legacy pic functions can be selected at runtime based on platform information, such as HW subarchitecure ID. Default structure of legacy_pic maintains the current code path for x86pc. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D03@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/i8259.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102bef80..b80987ca33ea 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -358,3 +358,46 @@ void init_8259A(int auto_eoi) spin_unlock_irqrestore(&i8259A_lock, flags); } +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID. + */ + +static void __init legacy_pic_noop(void) { }; +static void __init legacy_pic_uint_noop(unsigned int unused) { }; +static void __init legacy_pic_int_noop(int unused) { }; + +static struct irq_chip dummy_pic_chip = { + .name = "dummy pic", + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, + .disable = legacy_pic_uint_noop, + .mask_ack = legacy_pic_uint_noop, +}; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ + return 0; +} + +struct legacy_pic null_legacy_pic = { + .nr_legacy_irqs = 0, + .chip = &dummy_pic_chip, + .mask_all = legacy_pic_noop, + .restore_mask = legacy_pic_noop, + .init = legacy_pic_int_noop, + .irq_pending = legacy_pic_irq_pending_noop, + .make_irq = legacy_pic_uint_noop, +}; + +struct legacy_pic default_legacy_pic = { + .nr_legacy_irqs = NR_IRQS_LEGACY, + .chip = &i8259A_chip, + .mask_all = mask_8259A, + .restore_mask = unmask_8259A, + .init = init_8259A, + .irq_pending = i8259A_irq_pending, + .make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; -- cgit v1.2.2 From b81bb373a7e832a43921356aa1291044d7f52fb1 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Mon, 9 Nov 2009 11:27:04 -0800 Subject: x86, pic: Make use of legacy_pic abstraction This patch replaces legacy PIC-related global variable and functions with the new legacy_pic abstraction. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D04@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/apic.c | 8 +++--- arch/x86/kernel/apic/io_apic.c | 57 ++++++++++++++++++++---------------------- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/i8259.c | 21 ++++++++++------ arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/smpboot.c | 5 ++-- arch/x86/kernel/visws_quirks.c | 14 +++++++---- 7 files changed, 59 insertions(+), 50 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dfca210f6a10..94f22b12858d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1390,7 +1390,7 @@ void __init enable_IR_x2apic(void) } local_irq_save(flags); - mask_8259A(); + legacy_pic->mask_all(); mask_IO_APIC_setup(ioapic_entries); if (dmar_table_init_ret) @@ -1422,7 +1422,7 @@ void __init enable_IR_x2apic(void) nox2apic: if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); + legacy_pic->restore_mask(); local_irq_restore(flags); out: @@ -2018,7 +2018,7 @@ static int lapic_resume(struct sys_device *dev) } mask_IO_APIC_setup(ioapic_entries); - mask_8259A(); + legacy_pic->mask_all(); } if (x2apic_mode) @@ -2062,7 +2062,7 @@ static int lapic_resume(struct sys_device *dev) if (intr_remapping_enabled) { reenable_intr_remapping(x2apic_mode); - unmask_8259A(); + legacy_pic->restore_mask(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 75265ab83b17..1704cd82db5f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -94,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* Number of legacy interrupts */ -static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; @@ -147,7 +145,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS]; void __init io_apic_disable_legacy(void) { - nr_legacy_irqs = 0; nr_irqs_gsi = 0; } @@ -164,13 +161,13 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { - if (i < nr_legacy_irqs) + if (i < legacy_pic->nr_legacy_irqs) cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < nr_legacy_irqs) + if (i < legacy_pic->nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -850,7 +847,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < nr_legacy_irqs) { + if (irq < legacy_pic->nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1446,8 +1443,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < nr_legacy_irqs) - disable_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->chip->mask(irq); ioapic_write_entry(apic_id, pin, entry); } @@ -1810,7 +1807,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1894,7 +1891,7 @@ void __init enable_IO_APIC(void) nr_ioapic_registers[apic] = reg_01.bits.entries+1; } - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; for(apic = 0; apic < nr_ioapics; apic++) { @@ -1951,7 +1948,7 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; /* @@ -2184,9 +2181,9 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < nr_legacy_irqs) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) + if (irq < legacy_pic->nr_legacy_irqs) { + legacy_pic->chip->mask(irq); + if (legacy_pic->irq_pending(irq)) was_pending = 1; } cfg = irq_cfg(irq); @@ -2719,8 +2716,8 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < nr_legacy_irqs) - make_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; @@ -2877,7 +2874,7 @@ static inline void __init check_timer(void) /* * get/set the timer IRQ vector: */ - disable_8259A_irq(0); + legacy_pic->chip->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -2890,7 +2887,7 @@ static inline void __init check_timer(void) * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); + legacy_pic->init(1); #ifdef CONFIG_X86_32 { unsigned int ver; @@ -2949,7 +2946,7 @@ static inline void __init check_timer(void) if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2972,14 +2969,14 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } goto out; } @@ -2987,7 +2984,7 @@ static inline void __init check_timer(void) * Cleanup, just in case ... */ local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -3006,22 +3003,22 @@ static inline void __init check_timer(void) lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as ExtINT IRQ...\n"); - init_8259A(0); - make_8259A_irq(0); + legacy_pic->init(0); + legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); @@ -3063,7 +3060,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3074,7 +3071,7 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - if (nr_legacy_irqs) + if (legacy_pic->nr_legacy_irqs) check_timer(); } @@ -3875,7 +3872,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= nr_legacy_irqs) { + if (irq >= legacy_pic->nr_legacy_irqs) { cfg = desc->chip_data; if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396cb..3817739acee9 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -177,7 +177,7 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC) { if (!timer_through_8259) - disable_8259A_irq(0); + legacy_pic->chip->mask(0); on_each_cpu(__acpi_nmi_disable, NULL, 1); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index b80987ca33ea..1c790e75f7a0 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -34,6 +34,12 @@ static int i8259A_auto_eoi; DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +static void mask_8259A(void); +static void unmask_8259A(void); +static void disable_8259A_irq(unsigned int irq); +static void enable_8259A_irq(unsigned int irq); +static void init_8259A(int auto_eoi); +static int i8259A_irq_pending(unsigned int irq); struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -63,7 +69,7 @@ unsigned int cached_irq_mask = 0xffff; */ unsigned long io_apic_irqs; -void disable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; @@ -77,7 +83,7 @@ void disable_8259A_irq(unsigned int irq) spin_unlock_irqrestore(&i8259A_lock, flags); } -void enable_8259A_irq(unsigned int irq) +static void enable_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; @@ -91,7 +97,7 @@ void enable_8259A_irq(unsigned int irq) spin_unlock_irqrestore(&i8259A_lock, flags); } -int i8259A_irq_pending(unsigned int irq) +static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<<irq; unsigned long flags; @@ -107,7 +113,7 @@ int i8259A_irq_pending(unsigned int irq) return ret; } -void make_8259A_irq(unsigned int irq) +static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); @@ -281,7 +287,7 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); -void mask_8259A(void) +static void mask_8259A(void) { unsigned long flags; @@ -293,7 +299,7 @@ void mask_8259A(void) spin_unlock_irqrestore(&i8259A_lock, flags); } -void unmask_8259A(void) +static void unmask_8259A(void) { unsigned long flags; @@ -305,7 +311,7 @@ void unmask_8259A(void) spin_unlock_irqrestore(&i8259A_lock, flags); } -void init_8259A(int auto_eoi) +static void init_8259A(int auto_eoi) { unsigned long flags; @@ -358,6 +364,7 @@ void init_8259A(int auto_eoi) spin_unlock_irqrestore(&i8259A_lock, flags); } + /* * make i8259 a driver so that we can select pic functions at run time. the goal * is to make x86 binary compatible among pc compatible and non-pc compatible diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index d5932226614f..89b9510e8030 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -123,7 +123,7 @@ void __init init_ISA_irqs(void) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); #endif - init_8259A(0); + legacy_pic->init(0); /* * 16 old-style INTA-cycle interrupts: diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3e6150d421e4..f7a52f4a21a5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,6 +68,7 @@ #include <linux/mc146818rtc.h> #include <asm/smpboot_hooks.h> +#include <asm/i8259.h> #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; @@ -287,9 +288,9 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); enable_NMI_through_LVT0(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index b48ef6c0d716..f067e9556a47 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -505,7 +505,7 @@ static struct irq_chip cobalt_irq_type = { */ static unsigned int startup_piix4_master_irq(unsigned int irq) { - init_8259A(0); + legacy_pic->init(0); return startup_cobalt_irq(irq); } @@ -529,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { .name = "PIIX4-virtual", - .shutdown = disable_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, }; @@ -606,7 +603,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) handle_IRQ_event(realirq, desc->action); if (!(desc->status & IRQ_DISABLED)) - enable_8259A_irq(realirq); + legacy_pic->chip->unmask(realirq); return IRQ_HANDLED; @@ -625,6 +622,12 @@ static struct irqaction cascade_action = { .name = "cascade", }; +static inline void set_piix4_virtual_irq_type(void) +{ + piix4_virtual_irq_type.shutdown = i8259A_chip.mask; + piix4_virtual_irq_type.enable = i8259A_chip.unmask; + piix4_virtual_irq_type.disable = i8259A_chip.mask; +} void init_VISWS_APIC_irqs(void) { @@ -650,6 +653,7 @@ void init_VISWS_APIC_irqs(void) desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { + set_piix4_virtual_irq_type(); desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { -- cgit v1.2.2 From 1f91233c26fd5f7d6525fd29b95e4b50ca7a3e88 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Fri, 5 Feb 2010 04:06:56 -0800 Subject: x86, apic: Remove ioapic_disable_legacy() The ioapic_disable_legacy() call is no longer needed for platforms do not have legacy pic. the legacy pic abstraction has taken care it automatically. This patch also initialize irq-related static variables based on information obtained from legacy_pic. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F0755A30A7660@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1704cd82db5f..3592a72f3f0a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -143,11 +143,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; static struct irq_cfg irq_cfgx[NR_IRQS]; #endif -void __init io_apic_disable_legacy(void) -{ - nr_irqs_gsi = 0; -} - int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -156,6 +151,11 @@ int __init arch_early_irq_init(void) int node; int i; + if (!legacy_pic->nr_legacy_irqs) { + nr_irqs_gsi = 0; + io_apic_irqs = ~0UL; + } + cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); node= cpu_to_node(boot_cpu_id); -- cgit v1.2.2 From ff7fbc72e0c3ef7e94a27a3a918fd09ec9a30204 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Mon, 22 Feb 2010 14:51:33 -0800 Subject: x86, ptrace: Simplify xstateregs_get() 48 bytes (bytes 464..511) of the xstateregs payload come from the kernel defined structure (xstate_fx_sw_bytes). Rest comes from the xstate regs structure in the thread struct. Instead of having multiple user_regset_copyout()'s, simplify the xstateregs_get() by first copying the SW bytes into the xstate regs structure in the thread structure and then using one user_regset_copyout() to copyout the xstateregs. Requested-by: Roland McGrath <roland@redhat.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <20100222225240.494688491@sbs-t61.sc.intel.com> Acked-by: Roland McGrath <roland@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> Cc: Oleg Nesterov <oleg@redhat.com> --- arch/x86/kernel/i387.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 7a8a193b5144..81e23bf12c12 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -243,34 +243,18 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, return ret; /* - * First copy the fxsave bytes 0..463. + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->xsave, 0, - offsetof(struct user_xstateregs, - i387.xstate_fx_sw)); - if (ret) - return ret; - - /* - * Copy the 48bytes defined by software. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - xstate_fx_sw_bytes, - offsetof(struct user_xstateregs, - i387.xstate_fx_sw), - offsetof(struct user_xstateregs, - xsave_hdr)); - if (ret) - return ret; + memcpy(&target->thread.xstate->fxsave.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); /* - * Copy the rest of xstate memory layout. + * Copy the xstate memory layout. */ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->xsave.xsave_hdr, - offsetof(struct user_xstateregs, - xsave_hdr), -1); + &target->thread.xstate->xsave, 0, -1); return ret; } -- cgit v1.2.2 From 6dbbe14f21368a45aedba7eab0221857b8ad8d16 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Mon, 22 Feb 2010 14:51:34 -0800 Subject: x86, ptrace: Remove set_stopped_child_used_math() in [x]fpregs_set init_fpu() already ensures that the used_math() is set for the stopped child. Remove the redundant set_stopped_child_used_math() in [x]fpregs_set() Reported-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> LKML-Reference: <20100222225240.642169080@sbs-t61.sc.intel.com> Acked-by: Rolan McGrath <roland@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/i387.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 81e23bf12c12..c01a2b846d47 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -209,8 +209,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fxsave, 0, -1); @@ -471,8 +469,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); -- cgit v1.2.2 From 28a3c93d11212655ce0a9be977c405c703844164 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Tue, 23 Feb 2010 02:03:31 -0800 Subject: x86, pic: Fix section mismatch in legacy pic Move legacy_pic chip dummy functions out of init section as they might be referenced at run time. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D3AA@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/i8259.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 9bac6817456f..fb725ee15f55 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -371,9 +371,9 @@ static void init_8259A(int auto_eoi) * platforms, such as x86 MID. */ -static void __init legacy_pic_noop(void) { }; -static void __init legacy_pic_uint_noop(unsigned int unused) { }; -static void __init legacy_pic_int_noop(int unused) { }; +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; static struct irq_chip dummy_pic_chip = { .name = "dummy pic", -- cgit v1.2.2 From 05ddafb17ad1a73c8bc333cb328bad46513e85e7 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Wed, 23 Sep 2009 07:20:23 -0700 Subject: x86, ioapic: Early enable ioapic for timer irq Moorestown platform needs apic ready early for the system timer irq which is delievered via ioapic. Should not impact other platforms. In the longer term, once ioapic setup is moved before late time init, we will not need this patch to do early apic enabling. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D07@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b34854358ee6..8c848b5877a0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4289,3 +4289,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } + +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + + printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); +#endif + desc = irq_to_desc_alloc_node(0, 0); + + setup_local_APIC(); + + cfg = irq_cfg(0); + add_pin_to_irq_node(cfg, 0, 0, 0); + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); +} -- cgit v1.2.2 From 5b78b6724a405d4b649db53f7aac28c930c89640 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Fri, 12 Feb 2010 02:29:11 -0800 Subject: x86, mrst: Add dummy legacy pic to platform setup Moorestown has no legacy PIC; point it to the null legacy PIC. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D09@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/mrst.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 3b7078abc871..bdf9b17290c6 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -12,6 +12,9 @@ #include <linux/init.h> #include <asm/setup.h> +#include <asm/mrst.h> +#include <asm/io.h> +#include <asm/i8259.h> /* * Moorestown specific x86_init function overrides and early setup @@ -21,4 +24,6 @@ void __init x86_mrst_early_setup(void) { x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + + legacy_pic = &null_legacy_pic; } -- cgit v1.2.2 From af2730f6eefce24c4ef1dc3f8267d33626db81bc Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Fri, 12 Feb 2010 10:31:47 -0800 Subject: x86, mrst: Fill in PCI functions in x86_init layer This patch added Moorestown platform specific PCI init functions. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0A@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/mrst.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index bdf9b17290c6..98440e18919b 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,5 +25,8 @@ void __init x86_mrst_early_setup(void) x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + x86_init.pci.init = pci_mrst_init; + x86_init.pci.fixup_irqs = x86_init_noop; + legacy_pic = &null_legacy_pic; } -- cgit v1.2.2 From 16ab5395856d8953ae3d81e81bd6a8c269a1bfd6 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Fri, 12 Feb 2010 03:08:30 -0800 Subject: x86, mrst: Add platform timer info parsing code Moorestown platform timer information is obtained from SFI FW tables. This patch parses SFI table then assign the irq information to mp_irqs. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0B@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/mrst.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 98440e18919b..bb6e45c71dde 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -10,12 +10,122 @@ * of the License. */ #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sfi.h> +#include <linux/irq.h> #include <asm/setup.h> +#include <asm/mpspec_def.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/mrst.h> #include <asm/io.h> #include <asm/i8259.h> +static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; +static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +int sfi_mtimer_num; + +static inline void assign_to_mp_irq(struct mpc_intsrc *m, + struct mpc_intsrc *mp_irq) +{ + memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static void save_mp_irq(struct mpc_intsrc *m) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +/* parse all the mtimer info to a static mtimer array */ +static int __init sfi_parse_mtmr(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_timer_table_entry *pentry; + struct mpc_intsrc mp_irq; + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mtimer_num) { + sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_timer_table_entry); + pentry = (struct sfi_timer_table_entry *) sb->pentry; + totallen = sfi_mtimer_num * sizeof(*pentry); + memcpy(sfi_mtimer_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pentry = sfi_mtimer_array; + for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { + printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + " irq = %d\n", totallen, (u32)pentry->phys_addr, + pentry->freq_hz, pentry->irq); + if (!pentry->irq) + continue; + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; +/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + + return 0; +} + +struct sfi_timer_table_entry *sfi_get_mtmr(int hint) +{ + int i; + if (hint < sfi_mtimer_num) { + if (!sfi_mtimer_usage[hint]) { + pr_debug("hint taken for timer %d irq %d\n",\ + hint, sfi_mtimer_array[hint].irq); + sfi_mtimer_usage[hint] = 1; + return &sfi_mtimer_array[hint]; + } + } + /* take the first timer available */ + for (i = 0; i < sfi_mtimer_num;) { + if (!sfi_mtimer_usage[i]) { + sfi_mtimer_usage[i] = 1; + return &sfi_mtimer_array[i]; + } + i++; + } + return NULL; +} + +void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) +{ + int i; + for (i = 0; i < sfi_mtimer_num;) { + if (mtmr->irq == sfi_mtimer_array[i].irq) { + sfi_mtimer_usage[i] = 0; + return; + } + i++; + } +} + /* * Moorestown specific x86_init function overrides and early setup * calls. -- cgit v1.2.2 From cf089455966e21aeb8e4cd2669e0c1885667b04e Mon Sep 17 00:00:00 2001 From: Feng Tang <feng.tang@intel.com> Date: Fri, 12 Feb 2010 03:37:38 -0800 Subject: x86, mrst: Add vrtc platform data setup code vRTC information is obtained from SFI tables on Moorestown, this patch parses these tables and assign the information. Signed-off-by: Feng Tang <feng.tang@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0D@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/mrst.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index bb6e45c71dde..b7fa049c826c 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/sfi.h> #include <linux/irq.h> +#include <linux/module.h> #include <asm/setup.h> #include <asm/mpspec_def.h> @@ -27,6 +28,10 @@ static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; int sfi_mtimer_num; +struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; +EXPORT_SYMBOL_GPL(sfi_mrtc_array); +int sfi_mrtc_num; + static inline void assign_to_mp_irq(struct mpc_intsrc *m, struct mpc_intsrc *mp_irq) { @@ -126,6 +131,46 @@ void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) } } +/* parse all the mrtc info to a global mrtc array */ +int __init sfi_parse_mrtc(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_rtc_table_entry *pentry; + struct mpc_intsrc mp_irq; + + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mrtc_num) { + sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_rtc_table_entry); + pentry = (struct sfi_rtc_table_entry *)sb->pentry; + totallen = sfi_mrtc_num * sizeof(*pentry); + memcpy(sfi_mrtc_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pentry = sfi_mrtc_array; + for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { + printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + totallen, (u32)pentry->phys_addr, pentry->irq); + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = 0; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + return 0; +} + +void __init mrst_rtc_init(void) +{ + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. -- cgit v1.2.2 From bb24c4716185f6e116c440462c65c1f56649183b Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Wed, 2 Sep 2009 07:37:17 -0700 Subject: x86, apbt: Moorestown APB system timer driver Moorestown platform does not have PIT or HPET platform timers. Instead it has a bank of eight APB timers. The number of available timers to the os is exposed via SFI mtmr tables. All APB timer interrupts are routed via ioapic rtes and delivered as MSI. Currently, we use timer 0 and 1 for per cpu clockevent devices, timer 2 for clocksource. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D2D2@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/apb_timer.c | 780 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 781 insertions(+) create mode 100644 arch/x86/kernel/apb_timer.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09bc5a52..4c58352209e0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o +obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c new file mode 100644 index 000000000000..83a345b0256c --- /dev/null +++ b/arch/x86/kernel/apb_timer.c @@ -0,0 +1,780 @@ +/* + * apb_timer.c: Driver for Langwell APB timers + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * Langwell is the south complex of Intel Moorestown MID platform. There are + * eight external timers in total that can be used by the operating system. + * The timer information, such as frequency and addresses, is provided to the + * OS via SFI tables. + * Timer interrupts are routed via FW/HW emulated IOAPIC independently via + * individual redirection table entries (RTE). + * Unlike HPET, there is no master counter, therefore one of the timers are + * used as clocksource. The overall allocation looks like: + * - timer 0 - NR_CPUs for per cpu timer + * - one timer for clocksource + * - one timer for watchdog driver. + * It is also worth notice that APB timer does not support true one-shot mode, + * free-running mode will be used here to emulate one-shot mode. + * APB timer can also be used as broadcast timer along with per cpu local APIC + * timer, but by default APB timer has higher rating than local APIC timers. + */ + +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/sysdev.h> +#include <linux/pm.h> +#include <linux/pci.h> +#include <linux/sfi.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/irq.h> + +#include <asm/fixmap.h> +#include <asm/apb_timer.h> + +#define APBT_MASK CLOCKSOURCE_MASK(32) +#define APBT_SHIFT 22 +#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKSOURCE_RATING 250 +#define APBT_MIN_DELTA_USEC 200 + +#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) +#define APBT_CLOCKEVENT0_NUM (0) +#define APBT_CLOCKEVENT1_NUM (1) +#define APBT_CLOCKSOURCE_NUM (2) + +static unsigned long apbt_address; +static int apb_timer_block_enabled; +static void __iomem *apbt_virt_address; +static int phy_cs_timer_id; + +/* + * Common DW APB timer info + */ +static uint64_t apbt_freq; + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt); +static cycle_t apbt_read_clocksource(struct clocksource *cs); +static void apbt_restart_clocksource(void); + +struct apbt_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int tick; + unsigned int count; + unsigned int flags; + char name[10]; +}; + +int disable_apbt_percpu __cpuinitdata; + +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); +static struct apbt_dev *apbt_devs; +#endif + +static inline unsigned long apbt_readl_reg(unsigned long a) +{ + return readl(apbt_virt_address + a); +} + +static inline void apbt_writel_reg(unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a); +} + +static inline unsigned long apbt_readl(int n, unsigned long a) +{ + return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_writel(int n, unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_set_mapping(void) +{ + struct sfi_timer_table_entry *mtmr; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (unsigned long)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (apbt_virt_address) { + pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ + (void *)apbt_address, (void *)apbt_virt_address); + } else { + pr_debug("Failed mapping APBT phy address at %p\n",\ + (void *)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) + / APBTMRS_REG_SIZE; + pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + return; + +panic_noapbt: + panic("Failed to setup APB system timer\n"); + +} + +static inline void apbt_clear_mapping(void) +{ + iounmap(apbt_virt_address); + apbt_virt_address = NULL; +} + +/* + * APBT timer interrupt enable / disable + */ +static inline int is_apbt_capable(void) +{ + return apbt_virt_address ? 1 : 0; +} + +static struct clocksource clocksource_apbt = { + .name = "apbt", + .rating = APBT_CLOCKSOURCE_RATING, + .read = apbt_read_clocksource, + .mask = APBT_MASK, + .shift = APBT_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = apbt_restart_clocksource, +}; + +/* boot APB clock event device */ +static struct clock_event_device apbt_clockevent = { + .name = "apbt0", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = apbt_set_mode, + .set_next_event = apbt_next_event, + .shift = APBT_SHIFT, + .irq = 0, + .rating = APBT_CLOCKEVENT_RATING, +}; + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + disable_apbt_percpu = 0; + else if (strcmp("lapic_and_apbt", arg) == 0) + disable_apbt_percpu = 1; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * start count down from 0xffff_ffff. this is done by toggling the enable bit + * then load initial load count to ~0. + */ +static void apbt_start_counter(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); + /* enable, mask interrupt */ + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + /* read it once to get cached counter value initialized */ + apbt_read_clocksource(&clocksource_apbt); +} + +static irqreturn_t apbt_interrupt_handler(int irq, void *data) +{ + struct apbt_dev *dev = (struct apbt_dev *)data; + struct clock_event_device *aevt = &dev->evt; + + if (!aevt->event_handler) { + printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", + dev->num); + return IRQ_NONE; + } + aevt->event_handler(aevt); + return IRQ_HANDLED; +} + +static void apbt_restart_clocksource(void) +{ + apbt_start_counter(phy_cs_timer_id); +} + +/* Setup IRQ routing via IOAPIC */ +#ifdef CONFIG_SMP +static void apbt_setup_irq(struct apbt_dev *adev) +{ + struct irq_chip *chip; + struct irq_desc *desc; + + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + desc = irq_to_desc(adev->irq); + chip = get_irq_chip(adev->irq); + disable_irq(adev->irq); + desc->status |= IRQ_MOVE_PCNTXT; + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ + set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); + enable_irq(adev->irq); + if (system_state == SYSTEM_BOOTING) + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } +} +#endif + +static void apbt_enable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + /* clear pending intr */ + apbt_readl(n, APBTMR_N_EOI); + ctrl &= ~APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + +static void apbt_disable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl |= APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + + +static int __init apbt_clockevent_register(void) +{ + struct sfi_timer_table_entry *mtmr; + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz + , NSEC_PER_SEC, APBT_SHIFT); + + /* Calculate the min / max delta */ + apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &apbt_clockevent); + apbt_clockevent.min_delta_ns = clockevent_delta2ns( + APBT_MIN_DELTA_USEC*apbt_freq, + &apbt_clockevent); + /* + * Start apbt with the boot cpu mask and make it + * global if not used for per cpu timer. + */ + apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + + if (disable_apbt_percpu) { + apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; + global_clock_event = &apbt_clockevent; + printk(KERN_DEBUG "%s clockevent registered as global\n", + global_clock_event->name); + } + + if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + apbt_clockevent.name, &apbt_clockevent)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + apbt_clockevent.irq); + } + + clockevents_register_device(&apbt_clockevent); + /* Start APBT 0 interrupts */ + apbt_enable_int(APBT_CLOCKEVENT0_NUM); + + sfi_free_mtmr(mtmr); + return 0; +} + +#ifdef CONFIG_SMP +/* Should be called with per cpu */ +void apbt_setup_secondary_clock(void) +{ + struct apbt_dev *adev; + struct clock_event_device *aevt; + int cpu; + + /* Don't register boot CPU clockevent */ + cpu = smp_processor_id(); + if (cpu == boot_cpu_id) + return; + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); + adev = &per_cpu(cpu_apbt_dev, cpu); + aevt = &adev->evt; + + memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); + aevt->cpumask = cpumask_of(cpu); + aevt->name = adev->name; + aevt->mode = CLOCK_EVT_MODE_UNUSED; + + printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", + cpu, aevt->name, *(u32 *)aevt->cpumask); + + apbt_setup_irq(adev); + + clockevents_register_device(aevt); + + apbt_enable_int(cpu); + + return; +} + +/* + * this notify handler process CPU hotplug events. in case of S0i3, nonboot + * cpus are disabled/enabled frequently, for performance reasons, we keep the + * per cpu timer irq registered so that we do need to do free_irq/request_irq. + * + * TODO: it might be more reliable to directly disable percpu clockevent device + * without the notifier chain. currently, cpu 0 may get interrupts from other + * cpu timers during the offline process due to the ordering of notification. + * the extra interrupt is harmless. + */ +static int apbt_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + + switch (action & 0xf) { + case CPU_DEAD: + apbt_disable_int(cpu); + if (system_state == SYSTEM_RUNNING) + pr_debug("skipping APBT CPU %lu offline\n", cpu); + else if (adev) { + pr_debug("APBT clockevent for cpu %lu offline\n", cpu); + free_irq(adev->irq, adev); + } + break; + default: + pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + } + return NOTIFY_OK; +} + +static __init int apbt_late_init(void) +{ + if (disable_apbt_percpu) + return 0; + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(apbt_cpuhp_notify, -20); + return 0; +} +fs_initcall(apbt_late_init); +#else + +void apbt_setup_secondary_clock(void) {} + +#endif /* CONFIG_SMP */ + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long ctrl; + uint64_t delta; + int timer_num; + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + pr_debug("%s CPU %d timer %d mode=%d\n", + __func__, first_cpu(*evt->cpumask), timer_num, mode); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; + delta >>= apbt_clockevent.shift; + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_MODE_PERIODIC; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* + * DW APB p. 46, have to disable timer before load counter, + * may cause sync problem. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + udelay(1); + pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + /* APB timer does not have one-shot mode, use free running mode */ + case CLOCK_EVT_MODE_ONESHOT: + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + /* + * set free running mode, this mode will let timer reload max + * timeout which will give time (3min on 25MHz clock) to rearm + * the next event, therefore emulate the one-shot mode. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write again to set free running mode */ + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + + /* + * DW APB p. 46, load counter with all 1s before starting free + * running mode. + */ + apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); + ctrl &= ~APBTMR_CONTROL_INT; + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + apbt_disable_int(timer_num); + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_RESUME: + apbt_enable_int(timer_num); + break; + } +} + +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + unsigned long ctrl; + int timer_num; + + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + /* Disable timer */ + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write new count */ + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + return 0; +} + +/* + * APB timer clock is not in sync with pclk on Langwell, which translates to + * unreliable read value caused by sampling error. the error does not add up + * overtime and only happens when sampling a 0 as a 1 by mistake. so the time + * would go backwards. the following code is trying to prevent time traveling + * backwards. little bit paranoid. + */ +static cycle_t apbt_read_clocksource(struct clocksource *cs) +{ + unsigned long t0, t1, t2; + static unsigned long last_read; + +bad_count: + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if (unlikely(t1 < t2)) { + pr_debug("APBT: read current count error %lx:%lx:%lx\n", + t1, t2, t2 - t1); + goto bad_count; + } + /* + * check against cached last read, makes sure time does not go back. + * it could be a normal rollover but we will do tripple check anyway + */ + if (unlikely(t2 > last_read)) { + /* check if we have a normal rollover */ + unsigned long raw_intr_status = + apbt_readl_reg(APBTMRS_RAW_INT_STATUS); + /* + * cs timer interrupt is masked but raw intr bit is set if + * rollover occurs. then we read EOI reg to clear it. + */ + if (raw_intr_status & (1 << phy_cs_timer_id)) { + apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); + goto out; + } + pr_debug("APB CS going back %lx:%lx:%lx ", + t2, last_read, t2 - last_read); +bad_count_x3: + pr_debug(KERN_INFO "tripple check enforced\n"); + t0 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if ((t2 > t1) || (t1 > t0)) { + printk(KERN_ERR "Error: APB CS tripple check failed\n"); + goto bad_count_x3; + } + } +out: + last_read = t2; + return (cycle_t)~t2; +} + +static int apbt_clocksource_register(void) +{ + u64 start, now; + cycle_t t1; + + /* Start the counter, use timer 2 as source, timer 0/1 for event */ + apbt_start_counter(phy_cs_timer_id); + + /* Verify whether apbt counter works */ + t1 = apbt_read_clocksource(&clocksource_apbt); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + /* APBT is the only always on clocksource, it has to work! */ + if (t1 == apbt_read_clocksource(&clocksource_apbt)) + panic("APBT counter not counting. APBT disabled\n"); + + /* + * initialize and register APBT clocksource + * convert that to ns/clock cycle + * mult = (ns/c) * 2^APBT_SHIFT + */ + clocksource_apbt.mult = div_sc(MSEC_PER_SEC, + (unsigned long) apbt_freq, APBT_SHIFT); + clocksource_register(&clocksource_apbt); + + return 0; +} + +/* + * Early setup the APBT timer, only use timer 0 for booting then switch to + * per CPU timer if possible. + * returns 1 if per cpu apbt is setup + * returns 0 if no per cpu apbt is chosen + * panic if set up failed, this is the only platform timer on Moorestown. + */ +void __init apbt_time_init(void) +{ +#ifdef CONFIG_SMP + int i; + struct sfi_timer_table_entry *p_mtmr; + unsigned int percpu_timer; + struct apbt_dev *adev; +#endif + + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (apbt_virt_address) { + pr_debug("Found APBT version 0x%lx\n",\ + apbt_readl_reg(APBTMRS_COMP_VERSION)); + } else + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } +#ifdef CONFIG_SMP + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (disable_apbt_percpu) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= sfi_mtimer_num) { + percpu_timer = 1; + apbt_num_timers_used = num_possible_cpus(); + } else { + percpu_timer = 0; + apbt_num_timers_used = 1; + adev = &per_cpu(cpu_apbt_dev, 0); + adev->flags &= ~APBT_DEV_USED; + } + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, + GFP_KERNEL); + if (!apbt_devs) { + printk(KERN_ERR "Failed to allocate APB timer devices\n"); + return; + } + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) { + adev->tick = p_mtmr->freq_hz; + adev->irq = p_mtmr->irq; + } else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + adev->count = 0; + sprintf(adev->name, "apbt%d", i); + } +#endif + + return; + +out_noapbt: + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); +} + +static inline void apbt_disable(int n) +{ + if (is_apbt_capable()) { + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + } +} + +/* called before apb_timer_enable, use early map */ +unsigned long apbt_quick_calibrate() +{ + int i, scale; + u64 old, new; + cycle_t t1, t2; + unsigned long khz = 0; + u32 loop, shift; + + apbt_set_mapping(); + apbt_start_counter(phy_cs_timer_id); + + /* check if the timer can count down, otherwise return */ + old = apbt_read_clocksource(&clocksource_apbt); + i = 10000; + while (--i) { + if (old != apbt_read_clocksource(&clocksource_apbt)) + break; + } + if (!i) + goto failed; + + /* count 16 ms */ + loop = (apbt_freq * 1000) << 4; + + /* restart the timer to ensure it won't get to 0 in the calibration */ + apbt_start_counter(phy_cs_timer_id); + + old = apbt_read_clocksource(&clocksource_apbt); + old += loop; + + t1 = __native_read_tsc(); + + do { + new = apbt_read_clocksource(&clocksource_apbt); + } while (new < old); + + t2 = __native_read_tsc(); + + shift = 5; + if (unlikely(loop >> shift == 0)) { + printk(KERN_INFO + "APBT TSC calibration failed, not enough resolution\n"); + return 0; + } + scale = (int)div_u64((t2 - t1), loop >> shift); + khz = (scale * apbt_freq * 1000) >> shift; + printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); + return khz; +failed: + return 0; +} -- cgit v1.2.2 From 3746c6b6e26b8ad605f11b43e54acb3481d40980 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@intel.com> Date: Fri, 12 Feb 2010 05:01:12 -0800 Subject: x86, mrst: Platform clock setup code Add Moorestown platform clock setup code to the x86_init abstraction. Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D2D4@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/mrst.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index b7fa049c826c..0aad8670858e 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -23,6 +23,7 @@ #include <asm/mrst.h> #include <asm/io.h> #include <asm/i8259.h> +#include <asm/apb_timer.h> static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; @@ -166,11 +167,55 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) return 0; } +/* + * the secondary clock in Moorestown can be APBT or LAPIC clock, default to + * APBT but cmdline option can also override it. + */ +static void __cpuinit mrst_setup_secondary_clock(void) +{ + /* restore default lapic clock if disabled by cmdline */ + if (disable_apbt_percpu) + return setup_secondary_APIC_clock(); + apbt_setup_secondary_clock(); +} + +static unsigned long __init mrst_calibrate_tsc(void) +{ + unsigned long flags, fast_calibrate; + + local_irq_save(flags); + fast_calibrate = apbt_quick_calibrate(); + local_irq_restore(flags); + + if (fast_calibrate) + return fast_calibrate; + + return 0; +} + +void __init mrst_time_init(void) +{ + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + pre_init_apic_IRQ0(); + apbt_time_init(); +} + void __init mrst_rtc_init(void) { sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } +/* + * if we use per cpu apb timer, the bootclock already setup. if we use lapic + * timer and one apbt timer for broadcast, we need to set up lapic boot clock. + */ +static void __init mrst_setup_boot_clock(void) +{ + pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); + if (disable_apbt_percpu) + setup_boot_APIC_clock(); +}; + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -180,6 +225,14 @@ void __init x86_mrst_early_setup(void) x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + x86_init.timers.timer_init = mrst_time_init; + x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + + x86_init.irqs.pre_vector_init = x86_init_noop; + + x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + + x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_init.pci.init = pci_mrst_init; x86_init.pci.fixup_irqs = x86_init_noop; -- cgit v1.2.2 From 28c6a0ba30457380b140d9d7a61530eda8969180 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Tue, 23 Feb 2010 20:27:48 -0800 Subject: x86, legacy_irq: Remove left over nr_legacy_irqs nr_legacy_irqs and its ilk have moved to legacy_pic. -v2: there is one in ioapic_.c Singed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <4B84AAC4.2020204@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/irqinit.c | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8c848b5877a0..b9d08f0e1e33 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1440,7 +1440,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq * controllers like 8259. Now that IO-APIC can handle this irq, update * the cfg->domain. */ - if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) apic->vector_allocation_domain(0, cfg->domain); if (assign_irq_vector(irq, cfg, apic->target_cpus())) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index d2f787b3de56..ef257fc2921b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -99,9 +99,6 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Number of legacy interrupts */ -int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; - void __init init_ISA_irqs(void) { int i; @@ -114,7 +111,7 @@ void __init init_ISA_irqs(void) /* * 16 old-style INTA-cycle interrupts: */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; @@ -138,7 +135,7 @@ void __init init_IRQ(void) * then this vector space can be freed and re-used dynamically as the * irq's migrate etc. */ - for (i = 0; i < nr_legacy_irqs; i++) + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; x86_init.irqs.intr_init(); -- cgit v1.2.2 From 9eeeb09edba1e3544526611663472743ca584d36 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Tue, 23 Feb 2010 18:49:04 -0800 Subject: x86, legacy_irq: Remove duplicate vector assigment Remove duplicated cfg[i].vector assignment. Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <4B8493A0.6080501@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apic/io_apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b9d08f0e1e33..b758d49b811c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -161,8 +161,6 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { - if (i < legacy_pic->nr_legacy_irqs) - cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); -- cgit v1.2.2 From e808bae2407a087bfd40200a27587898e5a9909d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com> Date: Tue, 9 Feb 2010 21:38:45 -0200 Subject: x86: Do not reserve brk for DMI if it's not going to be used This will save 64K bytes from memory when loading linux if DMI is disabled, which is good for embedded systems. Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com> LKML-Reference: <1265758732-19320-1-git-send-email-cascardo@holoscopio.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3499b4fabc94..cb42109a55b4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -121,7 +121,9 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +#ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); +#endif unsigned int boot_cpu_id __read_mostly; -- cgit v1.2.2 From 0c54dd341fb701928b8e5dca91ced1870c55b05b Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Thu, 25 Feb 2010 08:42:06 -0500 Subject: ftrace: Remove memory barriers from NMI code when not needed The code in stop_machine that modifies the kernel text has a bit of logic to handle the case of NMIs. stop_machine does not prevent NMIs from executing, and if an NMI were to trigger on another CPU as the modifying CPU is changing the NMI text, a GPF could result. To prevent the GPF, the NMI calls ftrace_nmi_enter() which may modify the code first, then any other NMIs will just change the text to the same content which will do no harm. The code that stop_machine called must wait for NMIs to finish while it changes each location in the kernel. That code may also change the text to what the NMI changed it to. The key is that the text will never change content while another CPU is executing it. To make the above work, the call to ftrace_nmi_enter() must also do a smp_mb() as well as atomic_inc(). But for applications like perf that require a high number of NMIs for profiling, this can have a dramatic effect on the system. Not only is it doing a full memory barrier on both nmi_enter() as well as nmi_exit() it is also modifying a global variable with an atomic operation. This kills performance on large SMP machines. Since the memory barriers are only needed when ftrace is in the process of modifying the text (which is seldom), this patch adds a "modifying_code" variable that gets set before stop machine is executed and cleared afterwards. The NMIs will check this variable and store it in a per CPU "save_modifying_code" variable that it will use to check if it needs to do the memory barriers and atomic dec on NMI exit. Acked-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- arch/x86/kernel/ftrace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 309689245431..605ef196fdd6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -30,14 +30,32 @@ #ifdef CONFIG_DYNAMIC_FTRACE +/* + * modifying_code is set to notify NMIs that they need to use + * memory barriers when entering or exiting. But we don't want + * to burden NMIs with unnecessary memory barriers when code + * modification is not being done (which is most of the time). + * + * A mutex is already held when ftrace_arch_code_modify_prepare + * and post_process are called. No locks need to be taken here. + * + * Stop machine will make sure currently running NMIs are done + * and new NMIs will see the updated variable before we need + * to worry about NMIs doing memory barriers. + */ +static int modifying_code __read_mostly; +static DEFINE_PER_CPU(int, save_modifying_code); + int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { + modifying_code = 0; set_kernel_text_ro(); return 0; } @@ -149,6 +167,11 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { + __get_cpu_var(save_modifying_code) = modifying_code; + + if (!__get_cpu_var(save_modifying_code)) + return; + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { smp_rmb(); ftrace_mod_code(); @@ -160,6 +183,9 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { + if (!__get_cpu_var(save_modifying_code)) + return; + /* Finish all executions before clearing nmi_running */ smp_mb(); atomic_dec(&nmi_running); -- cgit v1.2.2 From d498f763950703c724c650db1d34a1c8679f9ca8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Thu, 25 Feb 2010 08:33:49 -0500 Subject: kprobes/x86: Cleanup RELATIVEJUMP_INSTRUCTION to RELATIVEJUMP_OPCODE Change RELATIVEJUMP_INSTRUCTION macro to RELATIVEJUMP_OPCODE since it represents just the opcode byte. Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Jim Keniston <jkenisto@us.ibm.com> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Anders Kaseorg <andersk@ksplice.com> Cc: Tim Abbott <tabbott@ksplice.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Jason Baron <jbaron@redhat.com> Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> LKML-Reference: <20100225133349.6725.99302.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5de9f4a9c3fd..15177cdfdd00 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -115,7 +115,7 @@ static void __kprobes set_jmp_op(void *from, void *to) } __attribute__((packed)) * jop; jop = (struct __arch_jmp_op *)from; jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_INSTRUCTION; + jop->op = RELATIVEJUMP_OPCODE; } /* -- cgit v1.2.2 From 0f94eb634ef7af736dee5639aac1c2fe9635d089 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Thu, 25 Feb 2010 08:34:23 -0500 Subject: kprobes/x86: Boost probes when reentering Integrate prepare_singlestep() into setup_singlestep() to boost up reenter probes, if possible. Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Jim Keniston <jkenisto@us.ibm.com> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Anders Kaseorg <andersk@ksplice.com> Cc: Tim Abbott <tabbott@ksplice.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Jason Baron <jbaron@redhat.com> Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> LKML-Reference: <20100225133423.6725.12071.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/kprobes.c | 48 +++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 15177cdfdd00..c69bb65006f3 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -406,18 +406,6 @@ static void __kprobes restore_btf(void) update_debugctlmsr(current->thread.debugctlmsr); } -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -430,19 +418,38 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, } static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, int reenter) { #if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return; } #endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; } /* @@ -456,11 +463,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: case KPROBE_HIT_ACTIVE: - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; + setup_singlestep(p, regs, kcb, 1); break; case KPROBE_HIT_SS: /* A probe has been hit in the codepath leading up to, or just @@ -535,13 +539,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * more here. */ if (!p->pre_handler || !p->pre_handler(p, regs)) - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } else if (kprobe_running()) { p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ -- cgit v1.2.2 From f007ea2685692bafb386820144cf73a14016fc7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Thu, 25 Feb 2010 08:34:30 -0500 Subject: kprobes/x86: Cleanup save/restore registers Introduce SAVE/RESOTRE_REGS_STRING for cleanup kretprobe-trampoline asm code. These macros will be used for emulating interruption. Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Jim Keniston <jkenisto@us.ibm.com> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Anders Kaseorg <andersk@ksplice.com> Cc: Tim Abbott <tabbott@ksplice.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Jason Baron <jbaron@redhat.com> Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> LKML-Reference: <20100225133430.6725.83342.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/kprobes.c | 128 ++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 61 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index c69bb65006f3..4ae95befd0eb 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -554,6 +554,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %ds\n" \ + " pushl %es\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -567,65 +630,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subq $24, %rsp\n" - " pushq %rdi\n" - " pushq %rsi\n" - " pushq %rdx\n" - " pushq %rcx\n" - " pushq %rax\n" - " pushq %r8\n" - " pushq %r9\n" - " pushq %r10\n" - " pushq %r11\n" - " pushq %rbx\n" - " pushq %rbp\n" - " pushq %r12\n" - " pushq %r13\n" - " pushq %r14\n" - " pushq %r15\n" + SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ " movq %rax, 152(%rsp)\n" - " popq %r15\n" - " popq %r14\n" - " popq %r13\n" - " popq %r12\n" - " popq %rbp\n" - " popq %rbx\n" - " popq %r11\n" - " popq %r10\n" - " popq %r9\n" - " popq %r8\n" - " popq %rax\n" - " popq %rcx\n" - " popq %rdx\n" - " popq %rsi\n" - " popq %rdi\n" - /* Skip orig_ax, ip, cs */ - " addq $24, %rsp\n" + RESTORE_REGS_STRING " popfq\n" #else " pushf\n" - /* - * Skip cs, ip, orig_ax and gs. - * trampoline_handler() will plug in these values - */ - " subl $16, %esp\n" - " pushl %fs\n" - " pushl %es\n" - " pushl %ds\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" + SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ @@ -633,15 +647,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ " movl %eax, 56(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* Skip ds, es, fs, gs, orig_ax and ip */ - " addl $24, %esp\n" + RESTORE_REGS_STRING " popf\n" #endif " ret\n"); -- cgit v1.2.2 From 3d55cc8a058ee96291d6d45b1e35121b9920eca3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Thu, 25 Feb 2010 08:34:38 -0500 Subject: x86: Add text_poke_smp for SMP cross modifying code Add generic text_poke_smp for SMP which uses stop_machine() to synchronize modifying code. This stop_machine() method is officially described at "7.1.3 Handling Self- and Cross-Modifying Code" on the intel's software developer's manual 3A. Since stop_machine() can't protect code against NMI/MCE, this function can not modify those handlers. And also, this function is basically for modifying multibyte-single-instruction. For modifying multibyte-multi-instructions, we need another special trap & detour code. This code originaly comes from immediate values with stop_machine() version. Thanks Jason and Mathieu! Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Jim Keniston <jkenisto@us.ibm.com> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Anders Kaseorg <andersk@ksplice.com> Cc: Tim Abbott <tabbott@ksplice.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Jason Baron <jbaron@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> LKML-Reference: <20100225133438.6725.80273.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/alternative.c | 60 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e63b80e5861c..c41f13c15e8f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -7,6 +7,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> +#include <linux/stop_machine.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -570,3 +571,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) local_irq_restore(flags); return addr; } + +/* + * Cross-modifying kernel text with stop_machine(). + * This code originally comes from immediate value. + */ +static atomic_t stop_machine_first; +static int wrote_text; + +struct text_poke_params { + void *addr; + const void *opcode; + size_t len; +}; + +static int __kprobes stop_machine_text_poke(void *data) +{ + struct text_poke_params *tpp = data; + + if (atomic_dec_and_test(&stop_machine_first)) { + text_poke(tpp->addr, tpp->opcode, tpp->len); + smp_wmb(); /* Make sure other cpus see that this has run */ + wrote_text = 1; + } else { + while (!wrote_text) + smp_rmb(); + sync_core(); + } + + flush_icache_range((unsigned long)tpp->addr, + (unsigned long)tpp->addr + tpp->len); + return 0; +} + +/** + * text_poke_smp - Update instructions on a live kernel on SMP + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Modify multi-byte instruction by using stop_machine() on SMP. This allows + * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying + * should be allowed, since stop_machine() does _not_ protect code against + * NMI and MCE. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) +{ + struct text_poke_params tpp; + + tpp.addr = addr; + tpp.opcode = opcode; + tpp.len = len; + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + return addr; +} + -- cgit v1.2.2 From c0f7ac3a9edde786bc129d37627953a8b8abefdf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Thu, 25 Feb 2010 08:34:46 -0500 Subject: kprobes/x86: Support kprobes jump optimization on x86 Introduce x86 arch-specific optimization code, which supports both of x86-32 and x86-64. This code also supports safety checking, which decodes whole of a function in which probe is inserted, and checks following conditions before optimization: - The optimized instructions which will be replaced by a jump instruction don't straddle the function boundary. - There is no indirect jump instruction, because it will jumps into the address range which is replaced by jump operand. - There is no jump/loop instruction which jumps into the address range which is replaced by jump operand. - Don't optimize kprobes if it is in functions into which fixup code will jumps. This uses text_poke_multibyte() which doesn't support modifying code on NMI/MCE handler. However, since kprobes itself doesn't support NMI/MCE code probing, it's not a problem. Changes in v9: - Use *_text_reserved() for checking the probe can be optimized. - Verify jump address range is in 2G range when preparing slot. - Backup original code when switching optimized buffer, instead of preparing buffer, because there can be int3 of other probes in preparing phase. - Check kprobe is disabled in arch_check_optimized_kprobe(). - Strictly check indirect jump opcodes (ff /4, ff /5). Changes in v6: - Split stop_machine-based jump patching code. - Update comments and coding style. Changes in v5: - Introduce stop_machine-based jump replacing. Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Jim Keniston <jkenisto@us.ibm.com> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Anders Kaseorg <andersk@ksplice.com> Cc: Tim Abbott <tabbott@ksplice.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Jason Baron <jbaron@redhat.com> Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com> LKML-Reference: <20100225133446.6725.78994.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/kprobes.c | 433 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 411 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 4ae95befd0eb..b43bbaebe2c0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -49,6 +49,7 @@ #include <linux/module.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> +#include <linux/ftrace.h> #include <asm/cacheflush.h> #include <asm/desc.h> @@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { }; const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) { - struct __arch_jmp_op { - char op; + struct __arch_relative_insn { + u8 op; s32 raddr; - } __attribute__((packed)) * jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_OPCODE; + } __attribute__((packed)) *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } /* @@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) /* * Basically, kp->ainsn.insn has an original instruction. * However, RIP-relative instruction can not do single-stepping - * at different place, fix_riprel() tweaks the displacement of + * at different place, __copy_instruction() tweaks the displacement of * that instruction. In that case, we can't recover the instruction * from the kp->ainsn.insn. * @@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) } /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. * If it does, Return the address of the 32-bit displacement word. * If not, return null. * Only applicable to 64-bit x86. */ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { -#ifdef CONFIG_X86_64 struct insn insn; - kernel_insn_init(&insn, p->ainsn.insn); + int ret; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + kernel_insn_init(&insn, src); + if (recover) { + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, + (unsigned long)src); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + } + insn_get_length(&insn); + memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; + kernel_insn_init(&insn, dest); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing @@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) p->addr + (s64) insn.displacement.value - - (u8 *) p->ainsn.insn; + newdisp = (u8 *) src + (s64) insn.displacement.value - + (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ - disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; } #endif + return insn.length; } static void __kprobes arch_copy_kprobe(struct kprobe *p) { - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - - fix_riprel(p); + /* + * Copy an instruction without recovering int3, because it will be + * put by another subsystem. + */ + __copy_instruction(p->ainsn.insn, p->addr, 0); if (can_boost(p->addr)) p->ainsn.boostable = 0; @@ -417,9 +443,20 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } +#ifdef CONFIG_OPTPROBES +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif + static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) { + if (setup_detour_execution(p, regs, reenter)) + return; + #if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ @@ -815,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p, * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; @@ -1043,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, + unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +void __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __get_cpu_var(current_kprobe) = &op->kp; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __get_cpu_var(current_kprobe) = NULL; + } + preempt_enable_no_resched(); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len, 1); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + int ret; + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + /* Dummy buffers for lookup_symbol_attrs */ + static char __dummy_buf[KSYM_NAME_LEN]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a breakpoint (int3) with a relative jump. */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ + unsigned char jmp_code[RELATIVEJUMP_SIZE]; + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + jmp_code[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&jmp_code[1]) = rel; + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +#endif + int __init arch_init_kprobes(void) { return 0; -- cgit v1.2.2 From d5d0e88c1e5b069aadb050ff6ec95df312de876a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 22 Feb 2010 05:42:04 -0800 Subject: x86, olpc: Use pci subarch init for OLPC Replace the #ifdef'ed OLPC-specific init functions by a conditional x86_init function. If the function returns 0 we leave pci_arch_init, otherwise we continue. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Jesse Barnes <jbarnes@virtuousgeek.org> Cc: Andres Salomon <dilinger@collabora.co.uk> LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318CE89@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/olpc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 9d1d263f786f..8297160c41b3 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,7 +17,9 @@ #include <linux/spinlock.h> #include <linux/io.h> #include <linux/string.h> + #include <asm/geode.h> +#include <asm/setup.h> #include <asm/olpc.h> #ifdef CONFIG_OPEN_FIRMWARE @@ -243,9 +245,11 @@ static int __init olpc_init(void) olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, (unsigned char *) &olpc_platform_info.ecver, 1); - /* check to see if the VSA exists */ - if (cs5535_has_vsa2()) - olpc_platform_info.flags |= OLPC_F_VSA; +#ifdef CONFIG_PCI_OLPC + /* If the VSA exists let it emulate PCI, if not emulate in kernel */ + if (!cs5535_has_vsa2()) + x86_init.pci.arch_init = pci_olpc_init; +#endif printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", -- cgit v1.2.2 From fb90ef93df654f2678933efbbf864adac0ae490e Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Wed, 24 Feb 2010 18:36:53 -0800 Subject: early_res: Add free_early_partial() To free partial areas in pcpu_setup... Reported-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Cc: Tejun Heo <tj@kernel.org> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Jesse Barnes <jbarnes@virtuousgeek.org> Cc: Pekka Enberg <penberg@cs.helsinki.fi> LKML-Reference: <4B85E245.5030001@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_percpu.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 35abcb8b00e9..ef6370b00e70 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { +#ifdef CONFIG_NO_BOOTMEM + u64 start = __pa(ptr); + u64 end = start + size; + free_early_partial(start, end); +#else free_bootmem(__pa(ptr), size); +#endif } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) -- cgit v1.2.2 From d76a0812ac4139ceb54daab3cc70e1bd8bd9d43a Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Mon, 8 Feb 2010 17:06:01 +0200 Subject: perf_events: Add new start/stop PMU callbacks In certain situations, the kernel may need to stop and start the same event rapidly. The current PMU callbacks do not distinguish between stop and release (i.e., stop + free the resource). Thus, a counter may be released, then it will be immediately re-acquired. Event scheduling will again take place with no guarantee to assign the same counter. On some processors, this may event yield to failure to assign the event back due to competion between cores. This patch is adding a new pair of callback to stop and restart a counter without actually release the underlying counter resource. On stop, the counter is stopped, its values saved and that's it. On start, the value is reloaded and counter is restarted (on x86, actual restart is delayed until perf_enable()). Signed-off-by: Stephane Eranian <eranian@google.com> [ added fallback to ->enable/->disable for all other PMUs fixed x86_pmu_start() to call x86_pmu.enable() merged __x86_pmu_disable into x86_pmu_stop() ] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <4b703875.0a04d00a.7896.ffffb824@mx.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a920f173a220..9173ea95f918 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1495,7 +1495,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } -static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc); +static void x86_pmu_stop(struct perf_event *event); void hw_perf_enable(void) { @@ -1533,7 +1533,7 @@ void hw_perf_enable(void) match_prev_assignment(hwc, cpuc, i)) continue; - __x86_pmu_disable(event, cpuc); + x86_pmu_stop(event); hwc->idx = -1; } @@ -1801,6 +1801,19 @@ static int x86_pmu_enable(struct perf_event *event) return 0; } +static int x86_pmu_start(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx == -1) + return -EAGAIN; + + x86_perf_event_set_period(event, hwc, hwc->idx); + x86_pmu.enable(hwc, hwc->idx); + + return 0; +} + static void x86_pmu_unthrottle(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1924,8 +1937,9 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) event->pending_kill = POLL_IN; } -static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc) +static void x86_pmu_stop(struct perf_event *event) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; @@ -1954,7 +1968,7 @@ static void x86_pmu_disable(struct perf_event *event) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; - __x86_pmu_disable(event, cpuc); + x86_pmu_stop(event); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { @@ -2667,6 +2681,8 @@ static inline void x86_pmu_read(struct perf_event *event) static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, + .start = x86_pmu_start, + .stop = x86_pmu_stop, .read = x86_pmu_read, .unthrottle = x86_pmu_unthrottle, }; -- cgit v1.2.2 From 38331f62c20456454eed9ebea2525f072c6f1d2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Mon, 8 Feb 2010 17:17:01 +0200 Subject: perf_events, x86: AMD event scheduling This patch adds correct AMD NorthBridge event scheduling. NB events are events measuring L3 cache, Hypertransport traffic. They are identified by an event code >= 0xe0. They measure events on the Northbride which is shared by all cores on a package. NB events are counted on a shared set of counters. When a NB event is programmed in a counter, the data actually comes from a shared counter. Thus, access to those counters needs to be synchronized. We implement the synchronization such that no two cores can be measuring NB events using the same counters. Thus, we maintain a per-NB allocation table. The available slot is propagated using the event_constraint structure. Signed-off-by: Stephane Eranian <eranian@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <4b703957.0702d00a.6bf2.7b7d@mx.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 265 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 262 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9173ea95f918..aa12f36e4711 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -80,6 +80,13 @@ struct event_constraint { int weight; }; +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + struct cpu_hw_events { struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; @@ -92,6 +99,7 @@ struct cpu_hw_events { int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct amd_nb *amd_nb; }; #define __EVENT_CONSTRAINT(c, n, m, w) {\ @@ -153,6 +161,8 @@ struct x86_pmu { static struct x86_pmu x86_pmu __read_mostly; +static raw_spinlock_t amd_nb_lock; + static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -802,7 +812,7 @@ static u64 amd_pmu_event_map(int hw_event) static u64 amd_pmu_raw_event(u64 hw_event) { -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL #define K7_EVNTSEL_INV_MASK 0x000800000ULL @@ -2210,6 +2220,7 @@ perf_event_nmi_handler(struct notifier_block *self, } static struct event_constraint unconstrained; +static struct event_constraint emptyconstraint; static struct event_constraint bts_constraint = EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); @@ -2249,10 +2260,146 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event return &unconstrained; } +/* + * AMD64 events are detected based on their event codes. + */ +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * only care about NB events + */ + if (!(nb && amd_is_nb_event(hwc))) + return; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_events; i++) { + if (nb->owners[i] == event) { + cmpxchg(nb->owners+i, event, NULL); + break; + } + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling amd_put_event_constraints(). + * + * Non NB events are not impacted by this restriction. + */ static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - return &unconstrained; + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old = NULL; + int max = x86_pmu.num_events; + int i, j, k = -1; + + /* + * if not NB event or no NB, then no constraints + */ + if (!(nb && amd_is_nb_event(hwc))) + return &unconstrained; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for (i = 0; i < max; i++) { + /* + * keep track of first free slot + */ + if (k == -1 && !nb->owners[i]) + k = i; + + /* already present, reuse */ + if (nb->owners[i] == event) + goto done; + } + /* + * not present, so grab a new slot + * starting either at: + */ + if (hwc->idx != -1) { + /* previous assignment */ + i = hwc->idx; + } else if (k != -1) { + /* start from free slot found */ + i = k; + } else { + /* + * event not found, no slot found in + * first pass, try again from the + * beginning + */ + i = 0; + } + j = i; + do { + old = cmpxchg(nb->owners+i, NULL, event); + if (!old) + break; + if (++i == max) + i = 0; + } while (i != j); +done: + if (!old) + return &nb->event_constraints[i]; + + return &emptyconstraint; } static int x86_event_sched_in(struct perf_event *event, @@ -2465,7 +2612,8 @@ static __initconst struct x86_pmu amd_pmu = { .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints }; static __init int p6_pmu_init(void) @@ -2589,6 +2737,91 @@ static __init int intel_pmu_init(void) return 0; } +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +{ + struct amd_nb *nb; + int i; + + nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + if (!nb) + return NULL; + + memset(nb, 0, sizeof(*nb)); + nb->nb_id = nb_id; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_events; i++) { + set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static void amd_pmu_cpu_online(int cpu) +{ + struct cpu_hw_events *cpu1, *cpu2; + struct amd_nb *nb = NULL; + int i, nb_id; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + /* + * function may be called too early in the + * boot process, in which case nb_id is bogus + */ + nb_id = amd_get_nb_id(cpu); + if (nb_id == BAD_APICID) + return; + + cpu1 = &per_cpu(cpu_hw_events, cpu); + cpu1->amd_nb = NULL; + + raw_spin_lock(&amd_nb_lock); + + for_each_online_cpu(i) { + cpu2 = &per_cpu(cpu_hw_events, i); + nb = cpu2->amd_nb; + if (!nb) + continue; + if (nb->nb_id == nb_id) + goto found; + } + + nb = amd_alloc_nb(cpu, nb_id); + if (!nb) { + pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); + raw_spin_unlock(&amd_nb_lock); + return; + } +found: + nb->refcnt++; + cpu1->amd_nb = nb; + + raw_spin_unlock(&amd_nb_lock); +} + +static void amd_pmu_cpu_offline(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock(&amd_nb_lock); + + if (--cpuhw->amd_nb->refcnt == 0) + kfree(cpuhw->amd_nb); + + cpuhw->amd_nb = NULL; + + raw_spin_unlock(&amd_nb_lock); +} + static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ @@ -2601,6 +2834,11 @@ static __init int amd_pmu_init(void) memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + /* + * explicitly initialize the boot cpu, other cpus will get + * the cpu hotplug callbacks from smp_init() + */ + amd_pmu_cpu_online(smp_processor_id()); return 0; } @@ -2934,4 +3172,25 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) void hw_perf_event_setup_online(int cpu) { init_debug_store_on_cpu(cpu); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + amd_pmu_cpu_online(cpu); + break; + default: + return; + } +} + +void hw_perf_event_setup_offline(int cpu) +{ + init_debug_store_on_cpu(cpu); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + amd_pmu_cpu_offline(cpu); + break; + default: + return; + } } -- cgit v1.2.2 From 6e37738a2fac964583debe91099bc3248554f6e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <peterz@infradead.org> Date: Thu, 11 Feb 2010 13:21:58 +0100 Subject: perf_events: Simplify code by removing cpu argument to hw_perf_group_sched_in() Since the cpu argument to hw_perf_group_sched_in() is always smp_processor_id(), simplify the code a little by removing this argument and using the current cpu where needed. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: David Miller <davem@davemloft.net> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> LKML-Reference: <1265890918.5396.3.camel@laptop> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index aa12f36e4711..ad096562d694 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2403,12 +2403,12 @@ done: } static int x86_event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { int ret = 0; event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = cpu; + event->oncpu = smp_processor_id(); event->tstamp_running += event->ctx->time - event->tstamp_stopped; if (!is_x86_event(event)) @@ -2424,7 +2424,7 @@ static int x86_event_sched_in(struct perf_event *event, } static void x86_event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -2452,9 +2452,9 @@ static void x86_event_sched_out(struct perf_event *event, */ int hw_perf_group_sched_in(struct perf_event *leader, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, int cpu) + struct perf_event_context *ctx) { - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct perf_event *sub; int assign[X86_PMC_IDX_MAX]; int n0, n1, ret; @@ -2468,14 +2468,14 @@ int hw_perf_group_sched_in(struct perf_event *leader, if (ret) return ret; - ret = x86_event_sched_in(leader, cpuctx, cpu); + ret = x86_event_sched_in(leader, cpuctx); if (ret) return ret; n1 = 1; list_for_each_entry(sub, &leader->sibling_list, group_entry) { if (sub->state > PERF_EVENT_STATE_OFF) { - ret = x86_event_sched_in(sub, cpuctx, cpu); + ret = x86_event_sched_in(sub, cpuctx); if (ret) goto undo; ++n1; @@ -2500,11 +2500,11 @@ int hw_perf_group_sched_in(struct perf_event *leader, */ return 1; undo: - x86_event_sched_out(leader, cpuctx, cpu); + x86_event_sched_out(leader, cpuctx); n0 = 1; list_for_each_entry(sub, &leader->sibling_list, group_entry) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { - x86_event_sched_out(sub, cpuctx, cpu); + x86_event_sched_out(sub, cpuctx); if (++n0 == n1) break; } -- cgit v1.2.2 From 6667661df4bc76083edf1e08831c20f64429709d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Wed, 10 Feb 2010 16:10:48 +0100 Subject: perf_events, x86: Remove superflous MSR writes We re-program the event control register every time we reset the count, this appears to be superflous, hence remove it. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arjan van de Ven <arjan@linux.intel.com> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ad096562d694..dd09ccc867d3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2009,9 +2009,6 @@ static int intel_pmu_save_and_restart(struct perf_event *event) x86_perf_event_update(event, hwc, idx); ret = x86_perf_event_set_period(event, hwc, idx); - if (event->state == PERF_EVENT_STATE_ACTIVE) - intel_pmu_enable_event(hwc, idx); - return ret; } -- cgit v1.2.2 From f22f54f4491acd987a6c5a92de52b60ca8b58b61 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 26 Feb 2010 12:05:05 +0100 Subject: perf_events, x86: Split PMU definitions into separate files Split amd,p6,intel into separate files so that we can easily deal with CONFIG_CPU_SUP_* things, needed to make things build now that perf_event.c relies on symbols from amd.c Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 1524 +------------------------------- arch/x86/kernel/cpu/perf_event_amd.c | 416 +++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 971 ++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_p6.c | 157 ++++ 4 files changed, 1554 insertions(+), 1514 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_event_amd.c create mode 100644 arch/x86/kernel/cpu/perf_event_intel.c create mode 100644 arch/x86/kernel/cpu/perf_event_p6.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index dd09ccc867d3..641ccb9dddbc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -161,8 +161,6 @@ struct x86_pmu { static struct x86_pmu x86_pmu __read_mostly; -static raw_spinlock_t amd_nb_lock; - static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -170,140 +168,6 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { static int x86_perf_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx); -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_EVENT 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_REG_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_REG_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - -static struct event_constraint intel_p6_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT_END -}; - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, -}; - -static struct event_constraint intel_core_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_core2_event_constraints[] = -{ - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_nehalem_event_constraints[] = -{ - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ - INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ - INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_westmere_event_constraints[] = -{ - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_gen_event_constraints[] = -{ - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - /* * Generalized hw caching related hw_event table, filled * in on a per model basis. A value of 0 means @@ -319,515 +183,6 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static __initconst u64 westmere_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (INTEL_ARCH_EVTSEL_MASK | \ - INTEL_ARCH_UNIT_MASK | \ - INTEL_ARCH_EDGE_MASK | \ - INTEL_ARCH_INV_MASK | \ - INTEL_ARCH_CNT_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static __initconst u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_REG_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. @@ -1079,42 +434,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return 0; } -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - /* * Setup the hardware configuration for a given attr_type */ @@ -1223,26 +542,6 @@ static int __hw_perf_event_init(struct perf_event *event) return 0; } -static void p6_pmu_disable_all(void) -{ - u64 val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); -} - static void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1278,33 +577,6 @@ void hw_perf_disable(void) x86_pmu.disable_all(); } -static void p6_pmu_enable_all(void) -{ - unsigned long val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_enable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; - - if (WARN_ON_ONCE(!event)) - return; - - intel_pmu_enable_bts(event->hw.config); - } -} - static void x86_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1578,20 +850,6 @@ void hw_perf_enable(void) x86_pmu.enable_all(); } -static inline u64 intel_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} - static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, @@ -1603,47 +861,6 @@ static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } -static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; - - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val = P6_NOP_EVENT; - - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - -static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); - return; - } - - x86_pmu_disable_event(hwc, idx); -} - static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* @@ -1702,70 +919,6 @@ x86_perf_event_set_period(struct perf_event *event, return ret; } -static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - - /* - * ANY bit is supported in v3 and up - */ - if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) - bits |= 0x4; - - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - - -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; - } - - __x86_pmu_enable_event(hwc, idx); -} - static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1887,66 +1040,6 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) -{ - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!event) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. */ - event->hw.interrupts++; - event->pending_kill = POLL_IN; -} - static void x86_pmu_stop(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1966,10 +1059,6 @@ static void x86_pmu_stop(struct perf_event *event) */ x86_perf_event_update(event, hwc, idx); - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); - cpuc->events[idx] = NULL; } @@ -1996,114 +1085,6 @@ static void x86_pmu_disable(struct perf_event *event) perf_event_update_userpage(event); } -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -static int intel_pmu_save_and_restart(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; - unsigned long flags; - int idx; - - if (!x86_pmu.num_events) - return; - - local_irq_save(flags); - - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - data.raw = NULL; - - cpuc = &__get_cpu_var(cpu_hw_events); - - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } - - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } - - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - data.period = event->hw.last_period; - - if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); - } - - intel_pmu_ack_status(ack); - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; -} - static int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; @@ -2216,37 +1197,20 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, + .next = NULL, + .priority = 1 +}; + static struct event_constraint unconstrained; static struct event_constraint emptyconstraint; -static struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); - -static struct event_constraint * -intel_special_constraints(struct perf_event *event) -{ - unsigned int hw_event; - - hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (event->hw.sample_period == 1))) { - - return &bts_constraint; - } - return NULL; -} - static struct event_constraint * -intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { struct event_constraint *c; - c = intel_special_constraints(event); - if (c) - return c; - if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { if ((event->hw.config & c->cmask) == c->code) @@ -2257,148 +1221,6 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event return &unconstrained; } -/* - * AMD64 events are detected based on their event codes. - */ -static inline int amd_is_nb_event(struct hw_perf_event *hwc) -{ - return (hwc->config & 0xe0) == 0xe0; -} - -static void amd_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct amd_nb *nb = cpuc->amd_nb; - int i; - - /* - * only care about NB events - */ - if (!(nb && amd_is_nb_event(hwc))) - return; - - /* - * need to scan whole list because event may not have - * been assigned during scheduling - * - * no race condition possible because event can only - * be removed on one CPU at a time AND PMU is disabled - * when we come here - */ - for (i = 0; i < x86_pmu.num_events; i++) { - if (nb->owners[i] == event) { - cmpxchg(nb->owners+i, event, NULL); - break; - } - } -} - - /* - * AMD64 NorthBridge events need special treatment because - * counter access needs to be synchronized across all cores - * of a package. Refer to BKDG section 3.12 - * - * NB events are events measuring L3 cache, Hypertransport - * traffic. They are identified by an event code >= 0xe00. - * They measure events on the NorthBride which is shared - * by all cores on a package. NB events are counted on a - * shared set of counters. When a NB event is programmed - * in a counter, the data actually comes from a shared - * counter. Thus, access to those counters needs to be - * synchronized. - * - * We implement the synchronization such that no two cores - * can be measuring NB events using the same counters. Thus, - * we maintain a per-NB allocation table. The available slot - * is propagated using the event_constraint structure. - * - * We provide only one choice for each NB event based on - * the fact that only NB events have restrictions. Consequently, - * if a counter is available, there is a guarantee the NB event - * will be assigned to it. If no slot is available, an empty - * constraint is returned and scheduling will eventually fail - * for this event. - * - * Note that all cores attached the same NB compete for the same - * counters to host NB events, this is why we use atomic ops. Some - * multi-chip CPUs may have more than one NB. - * - * Given that resources are allocated (cmpxchg), they must be - * eventually freed for others to use. This is accomplished by - * calling amd_put_event_constraints(). - * - * Non NB events are not impacted by this restriction. - */ -static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct amd_nb *nb = cpuc->amd_nb; - struct perf_event *old = NULL; - int max = x86_pmu.num_events; - int i, j, k = -1; - - /* - * if not NB event or no NB, then no constraints - */ - if (!(nb && amd_is_nb_event(hwc))) - return &unconstrained; - - /* - * detect if already present, if so reuse - * - * cannot merge with actual allocation - * because of possible holes - * - * event can already be present yet not assigned (in hwc->idx) - * because of successive calls to x86_schedule_events() from - * hw_perf_group_sched_in() without hw_perf_enable() - */ - for (i = 0; i < max; i++) { - /* - * keep track of first free slot - */ - if (k == -1 && !nb->owners[i]) - k = i; - - /* already present, reuse */ - if (nb->owners[i] == event) - goto done; - } - /* - * not present, so grab a new slot - * starting either at: - */ - if (hwc->idx != -1) { - /* previous assignment */ - i = hwc->idx; - } else if (k != -1) { - /* start from free slot found */ - i = k; - } else { - /* - * event not found, no slot found in - * first pass, try again from the - * beginning - */ - i = 0; - } - j = i; - do { - old = cmpxchg(nb->owners+i, NULL, event); - if (!old) - break; - if (++i == max) - i = 0; - } while (i != j); -done: - if (!old) - return &nb->event_constraints[i]; - - return &emptyconstraint; -} - static int x86_event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx) { @@ -2509,335 +1331,9 @@ undo: return ret; } -static __read_mostly struct notifier_block perf_event_nmi_notifier = { - .notifier_call = perf_event_nmi_handler, - .next = NULL, - .priority = 1 -}; - -static __initconst struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = x86_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_events = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .event_bits = 32, - .event_mask = (1ULL << 32) - 1, - .get_event_constraints = intel_get_event_constraints, - .event_constraints = intel_p6_event_constraints -}; - -static __initconst struct x86_pmu core_pmu = { - .name = "core", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .get_event_constraints = intel_get_event_constraints, - .event_constraints = intel_core_event_constraints, -}; - -static __initconst struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, - .get_event_constraints = intel_get_event_constraints -}; - -static __initconst struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints -}; - -static __init int p6_pmu_init(void) -{ - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - case 9: - case 13: - /* Pentium M */ - break; - default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } - - x86_pmu = p6_pmu; - - return 0; -} - -static __init int intel_pmu_init(void) -{ - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } - } - - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; - - version = eax.split.version_id; - if (version < 2) - x86_pmu = core_pmu; - else - x86_pmu = intel_pmu; - - x86_pmu.version = version; - x86_pmu.num_events = eax.split.num_events; - x86_pmu.event_bits = eax.split.bit_width; - x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; - - /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: - */ - if (version > 1) - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); - - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 14: /* 65 nm core solo/duo, "Yonah" */ - pr_cont("Core events, "); - break; - - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - x86_pmu.event_constraints = intel_core2_event_constraints; - pr_cont("Core2 events, "); - break; - - case 26: /* 45 nm nehalem, "Bloomfield" */ - case 30: /* 45 nm nehalem, "Lynnfield" */ - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - x86_pmu.event_constraints = intel_nehalem_event_constraints; - pr_cont("Nehalem/Corei7 events, "); - break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("Atom events, "); - break; - - case 37: /* 32 nm nehalem, "Clarkdale" */ - case 44: /* 32 nm nehalem, "Gulftown" */ - memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - x86_pmu.event_constraints = intel_westmere_event_constraints; - pr_cont("Westmere events, "); - break; - default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); - } - return 0; -} - -static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) -{ - struct amd_nb *nb; - int i; - - nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); - if (!nb) - return NULL; - - memset(nb, 0, sizeof(*nb)); - nb->nb_id = nb_id; - - /* - * initialize all possible NB constraints - */ - for (i = 0; i < x86_pmu.num_events; i++) { - set_bit(i, nb->event_constraints[i].idxmsk); - nb->event_constraints[i].weight = 1; - } - return nb; -} - -static void amd_pmu_cpu_online(int cpu) -{ - struct cpu_hw_events *cpu1, *cpu2; - struct amd_nb *nb = NULL; - int i, nb_id; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - /* - * function may be called too early in the - * boot process, in which case nb_id is bogus - */ - nb_id = amd_get_nb_id(cpu); - if (nb_id == BAD_APICID) - return; - - cpu1 = &per_cpu(cpu_hw_events, cpu); - cpu1->amd_nb = NULL; - - raw_spin_lock(&amd_nb_lock); - - for_each_online_cpu(i) { - cpu2 = &per_cpu(cpu_hw_events, i); - nb = cpu2->amd_nb; - if (!nb) - continue; - if (nb->nb_id == nb_id) - goto found; - } - - nb = amd_alloc_nb(cpu, nb_id); - if (!nb) { - pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); - raw_spin_unlock(&amd_nb_lock); - return; - } -found: - nb->refcnt++; - cpu1->amd_nb = nb; - - raw_spin_unlock(&amd_nb_lock); -} - -static void amd_pmu_cpu_offline(int cpu) -{ - struct cpu_hw_events *cpuhw; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - cpuhw = &per_cpu(cpu_hw_events, cpu); - - raw_spin_lock(&amd_nb_lock); - - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); - - cpuhw->amd_nb = NULL; - - raw_spin_unlock(&amd_nb_lock); -} - -static __init int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; - - x86_pmu = amd_pmu; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - /* - * explicitly initialize the boot cpu, other cpus will get - * the cpu hotplug callbacks from smp_init() - */ - amd_pmu_cpu_online(smp_processor_id()); - return 0; -} +#include "perf_event_amd.c" +#include "perf_event_p6.c" +#include "perf_event_intel.c" static void __init pmu_check_apic(void) { diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c new file mode 100644 index 000000000000..6d28e08563e8 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -0,0 +1,416 @@ +#ifdef CONFIG_CPU_SUP_AMD + +static raw_spinlock_t amd_nb_lock; + +static __initconst u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +static u64 amd_pmu_raw_event(u64 hw_event) +{ +#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_REG_MASK) + + return hw_event & K7_EVNTSEL_MASK; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * only care about NB events + */ + if (!(nb && amd_is_nb_event(hwc))) + return; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_events; i++) { + if (nb->owners[i] == event) { + cmpxchg(nb->owners+i, event, NULL); + break; + } + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling amd_put_event_constraints(). + * + * Non NB events are not impacted by this restriction. + */ +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old = NULL; + int max = x86_pmu.num_events; + int i, j, k = -1; + + /* + * if not NB event or no NB, then no constraints + */ + if (!(nb && amd_is_nb_event(hwc))) + return &unconstrained; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for (i = 0; i < max; i++) { + /* + * keep track of first free slot + */ + if (k == -1 && !nb->owners[i]) + k = i; + + /* already present, reuse */ + if (nb->owners[i] == event) + goto done; + } + /* + * not present, so grab a new slot + * starting either at: + */ + if (hwc->idx != -1) { + /* previous assignment */ + i = hwc->idx; + } else if (k != -1) { + /* start from free slot found */ + i = k; + } else { + /* + * event not found, no slot found in + * first pass, try again from the + * beginning + */ + i = 0; + } + j = i; + do { + old = cmpxchg(nb->owners+i, NULL, event); + if (!old) + break; + if (++i == max) + i = 0; + } while (i != j); +done: + if (!old) + return &nb->event_constraints[i]; + + return &emptyconstraint; +} + +static __initconst struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints +}; + +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +{ + struct amd_nb *nb; + int i; + + nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + if (!nb) + return NULL; + + memset(nb, 0, sizeof(*nb)); + nb->nb_id = nb_id; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_events; i++) { + set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static void amd_pmu_cpu_online(int cpu) +{ + struct cpu_hw_events *cpu1, *cpu2; + struct amd_nb *nb = NULL; + int i, nb_id; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + /* + * function may be called too early in the + * boot process, in which case nb_id is bogus + */ + nb_id = amd_get_nb_id(cpu); + if (nb_id == BAD_APICID) + return; + + cpu1 = &per_cpu(cpu_hw_events, cpu); + cpu1->amd_nb = NULL; + + raw_spin_lock(&amd_nb_lock); + + for_each_online_cpu(i) { + cpu2 = &per_cpu(cpu_hw_events, i); + nb = cpu2->amd_nb; + if (!nb) + continue; + if (nb->nb_id == nb_id) + goto found; + } + + nb = amd_alloc_nb(cpu, nb_id); + if (!nb) { + pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); + raw_spin_unlock(&amd_nb_lock); + return; + } +found: + nb->refcnt++; + cpu1->amd_nb = nb; + + raw_spin_unlock(&amd_nb_lock); +} + +static void amd_pmu_cpu_offline(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock(&amd_nb_lock); + + if (--cpuhw->amd_nb->refcnt == 0) + kfree(cpuhw->amd_nb); + + cpuhw->amd_nb = NULL; + + raw_spin_unlock(&amd_nb_lock); +} + +static __init int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + /* + * explicitly initialize the boot cpu, other cpus will get + * the cpu hotplug callbacks from smp_init() + */ + amd_pmu_cpu_online(smp_processor_id()); + return 0; +} + +#else /* CONFIG_CPU_SUP_AMD */ + +static int amd_pmu_init(void) +{ + return 0; +} + +static void amd_pmu_cpu_online(int cpu) +{ +} + +static void amd_pmu_cpu_offline(int cpu) +{ +} + +#endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c new file mode 100644 index 000000000000..cf6590cf4a5f --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -0,0 +1,971 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static struct event_constraint intel_core_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +static __initconst u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 hw_event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (INTEL_ARCH_EVTSEL_MASK | \ + INTEL_ARCH_UNIT_MASK | \ + INTEL_ARCH_EDGE_MASK | \ + INTEL_ARCH_INV_MASK | \ + INTEL_ARCH_CNT_MASK) + + return hw_event & CORE_EVNTSEL_MASK; +} + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); +} + +static void intel_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + + data.period = event->hw.last_period; + data.addr = 0; + data.raw = NULL; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static inline void +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + intel_pmu_drain_bts_buffer(); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_event(hwc, idx); +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + __x86_pmu_enable_event(hwc, idx); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +static int intel_pmu_save_and_restart(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + int ret; + + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); + + return ret; +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + unsigned long flags; + int idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 ack, status; + + data.addr = 0; + data.raw = NULL; + + cpuc = &__get_cpu_var(cpu_hw_events); + + perf_disable(); + intel_pmu_drain_bts_buffer(); + status = intel_pmu_get_status(); + if (!status) { + perf_enable(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + perf_enable(); + return 1; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + data.period = event->hw.last_period; + + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + + perf_enable(); + + return 1; +} + +static struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + +static struct event_constraint * +intel_special_constraints(struct perf_event *event) +{ + unsigned int hw_event; + + hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (event->hw.sample_period == 1))) { + + return &bts_constraint; + } + return NULL; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_special_constraints(event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, event); +} + +static __initconst struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_core_event_constraints, +}; + +static __initconst struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, + .get_event_constraints = intel_get_event_constraints +}; + +static __init int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_core2_event_constraints; + pr_cont("Core2 events, "); + break; + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("Atom events, "); + break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + pr_cont("Westmere events, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + } + return 0; +} + +#else /* CONFIG_CPU_SUP_INTEL */ + +static int intel_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c new file mode 100644 index 000000000000..1ca5ba078afd --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -0,0 +1,157 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 hw_event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_REG_MASK) + + return hw_event & P6_EVNTSEL_MASK; +} + +static struct event_constraint p6_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; + +static void p6_pmu_disable_all(void) +{ + u64 val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void p6_pmu_enable_all(void) +{ + unsigned long val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static inline void +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + +static __initconst struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = x86_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_events = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, +}; + +static __init int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ -- cgit v1.2.2 From 1dd2980d990068e20045b90c424518cc7f3657ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <peterz@infradead.org> Date: Fri, 26 Feb 2010 17:07:35 +0100 Subject: perf_event, amd: Fix spinlock initialization Avoid kernels from exploding on AMD machines when they have any lock debugging bits enabled. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 6d28e08563e8..8f3dbfda3c4f 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,6 +1,6 @@ #ifdef CONFIG_CPU_SUP_AMD -static raw_spinlock_t amd_nb_lock; +static DEFINE_RAW_SPINLOCK(amd_nb_lock); static __initconst u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] -- cgit v1.2.2 From 78c06176466cbd1b3f0f67709d3023c40dbebcbd Mon Sep 17 00:00:00 2001 From: Russ Anderson <rja@sgi.com> Date: Fri, 26 Feb 2010 10:49:12 -0600 Subject: x86: Enable NMI on all cpus on UV Enable NMI on all cpus in UV system and add an NMI handler to dump_stack on each cpu. By default on x86 all the cpus except the boot cpu have NMI masked off. This patch enables NMI on all cpus in UV system and adds an NMI handler to dump_stack on each cpu. This way if a system hangs we can NMI the machine and get a backtrace from all the cpus. Version 2: Use x86_platform driver mechanism for nmi init, per Ingo's suggestion. Version 3: Clean up Ingo's nits. Signed-off-by: Russ Anderson <rja@sgi.com> LKML-Reference: <20100226164912.GA24439@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/x2apic_uv_x.c | 44 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 3 +++ 3 files changed, 48 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 6ef2899eb861..4b8dbb256147 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/pci.h> +#include <linux/kdebug.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -41,6 +42,7 @@ static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +static DEFINE_SPINLOCK(uv_nmi_lock); static inline bool is_GRU_range(u64 start, u64 end) { @@ -74,6 +76,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (!strcmp(oem_id, "SGI")) { nodeid = early_get_nodeid(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) @@ -596,6 +599,46 @@ void __cpuinit uv_cpu_init(void) set_x2apic_extra_bits(uv_hub_info->pnode); } +/* + * When NMI is received, print a stack trace. + */ +int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) +{ + if (reason != DIE_NMI_IPI) + return NOTIFY_OK; + /* + * Use a lock so only one cpu prints at a time + * to prevent intermixed output. + */ + spin_lock(&uv_nmi_lock); + pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + dump_stack(); + spin_unlock(&uv_nmi_lock); + + return NOTIFY_STOP; +} + +static struct notifier_block uv_dump_stack_nmi_nb = { + .notifier_call = uv_handle_nmi +}; + +void uv_register_nmi_notifier(void) +{ + if (register_die_notifier(&uv_dump_stack_nmi_nb)) + printk(KERN_WARNING "UV NMI handler failed to register\n"); +} + +void uv_nmi_init(void) +{ + unsigned int value; + + /* + * Unmask NMI on all cpus + */ + value = apic_read(APIC_LVT1) | APIC_DM_NMI; + value &= ~APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} void __init uv_system_init(void) { @@ -717,6 +760,7 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); + uv_register_nmi_notifier(); proc_mkdir("sgi_uv", NULL); /* register Legacy VGA I/O redirection handler */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..838a118876c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -320,6 +320,7 @@ notrace static void __cpuinit start_secondary(void *unused) unlock_vector_lock(); ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + x86_platform.nmi_init(); /* enable local interrupts */ local_irq_enable(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ccd179dec36e..ee5746c94628 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -76,10 +76,13 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; +static void default_nmi_init(void) { }; + struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, + .nmi_init = default_nmi_init }; -- cgit v1.2.2 From 21c2fd9970cc8e457058f84016450a2e9876125e Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 26 Feb 2010 11:17:16 +0100 Subject: x86: apic: Fix mismerge, add arch_probe_nr_irqs() again Merge commit aef55d4922 mis-merged io_apic.c so we lost the arch_probe_nr_irqs() method. This caused subtle boot breakages (udev confusion likely due to missing drivers) with certain configs. Cc: H. Peter Anvin <hpa@zytor.com> Cc: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <20100207210250.GB8256@jenkins.home.ifup.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/io_apic.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 979589881c80..72ac2a332993 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3876,6 +3876,28 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +#ifdef CONFIG_SPARSE_IRQ +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) + /* + * for MSI and HT dyn irq + */ + nr += nr_irqs_gsi * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + return 0; +} +#endif + static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { -- cgit v1.2.2 From 3d083407a16698de86b42aee0da2ffb280b5cb7e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Sat, 27 Feb 2010 17:24:15 +0100 Subject: x86/hw-breakpoints: Remove the name field Remove the name field from the arch_hw_breakpoint. We never deal with target symbols in the arch level, neither do we need to ever store it. It's a legacy for the previous version of the x86 breakpoint backend. Let's remove it. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: K.Prasad <prasad@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/hw_breakpoint.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index dca2802c666f..41e08dff0161 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -343,13 +343,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - /* - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. -- cgit v1.2.2 From 3249b7e1df6380e9d7bb3238f64f445bf614f787 Mon Sep 17 00:00:00 2001 From: Ian Campbell <ian.campbell@citrix.com> Date: Fri, 26 Feb 2010 17:16:01 +0000 Subject: x86, vmi: Disable highmem PTE allocation even when CONFIG_HIGHPTE=y Preventing HIGHPTE allocations under VMI will allow us to remove the kmap_atomic_pte paravirt op. Signed-off-by: Ian Campbell <ian.campbell@citrix.com> LKML-Reference: <1267204562-11844-2-git-send-email-ian.campbell@citrix.com> Acked-by: Alok Kataria <akataria@vmware.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/vmi_32.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index d430e4c30193..58aca86193e5 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -33,6 +33,7 @@ #include <asm/fixmap.h> #include <asm/apicdef.h> #include <asm/apic.h> +#include <asm/pgalloc.h> #include <asm/processor.h> #include <asm/timer.h> #include <asm/vmi_time.h> @@ -272,19 +273,11 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) void *va = kmap_atomic(page, type); /* - * Internally, the VMI ROM must map virtual addresses to physical - * addresses for processing MMU updates. By the time MMU updates - * are issued, this information is typically already lost. - * Fortunately, the VMI provides a cache of mapping slots for active - * page tables. - * - * We use slot zero for the linear mapping of physical memory, and - * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1. - * - * args: SLOT VA COUNT PFN + * We disable highmem allocations for page tables so we should never + * see any calls to kmap_atomic_pte on a highmem page. */ - BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); + + BUG_ON(PageHighmem(page)); return va; } @@ -640,6 +633,12 @@ static inline int __init activate_vmi(void) u64 reloc; const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + /* + * Prevent page tables from being allocated in highmem, even if + * CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + if (call_vrom_func(vmi_rom, vmi_init) != 0) { printk(KERN_ERR "VMI ROM failed to initialize!"); return 0; -- cgit v1.2.2 From dad52fc01161afcb8798c609e009aed4d104927f Mon Sep 17 00:00:00 2001 From: Ian Campbell <ian.campbell@citrix.com> Date: Fri, 26 Feb 2010 17:16:02 +0000 Subject: x86, paravirt: Remove kmap_atomic_pte paravirt op. Now that both Xen and VMI disable allocations of PTE pages from high memory this paravirt op serves no further purpose. This effectively reverts ce6234b5 "add kmap_atomic_pte for mapping highpte pages". Signed-off-by: Ian Campbell <ian.campbell@citrix.com> LKML-Reference: <1267204562-11844-3-git-send-email-ian.campbell@citrix.com> Acked-by: Alok Kataria <akataria@vmware.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/paravirt.c | 4 ---- arch/x86/kernel/vmi_32.c | 20 -------------------- 2 files changed, 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b1739d16310..1db183ed7c01 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, -#endif - #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 58aca86193e5..7dd599deca4a 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -267,22 +267,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_HIGHPTE -static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) -{ - void *va = kmap_atomic(page, type); - - /* - * We disable highmem allocations for page tables so we should never - * see any calls to kmap_atomic_pte on a highmem page. - */ - - BUG_ON(PageHighmem(page)); - - return va; -} -#endif - static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -777,10 +761,6 @@ static inline int __init activate_vmi(void) /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); -#ifdef CONFIG_HIGHPTE - if (vmi_ops.set_linear_mapping) - pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; -#endif /* * These MUST always be patched. Don't support indirect jumps -- cgit v1.2.2 From fad539956c9e69749a03f7817d22d1bab87657bf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Sun, 28 Feb 2010 01:06:34 -0800 Subject: x86: Fix out of order of gsi Iranna D Ankad reported that IBM x3950 systems have boot problems after this commit: | | commit b9c61b70075c87a8612624736faf4a2de5b1ed30 | | x86/pci: update pirq_enable_irq() to setup io apic routing | The problem is that with the patch, the machine freezes when console=ttyS0,... kernel serial parameter is passed. It seem to freeze at DVD initialization and the whole problem seem to be DVD/pata related, but somehow exposed through the serial parameter. Such apic problems can expose really weird behavior: ACPI: IOAPIC (id[0x10] address[0xfecff000] gsi_base[0]) IOAPIC[0]: apic_id 16, version 0, address 0xfecff000, GSI 0-2 ACPI: IOAPIC (id[0x0f] address[0xfec00000] gsi_base[3]) IOAPIC[1]: apic_id 15, version 0, address 0xfec00000, GSI 3-38 ACPI: IOAPIC (id[0x0e] address[0xfec01000] gsi_base[39]) IOAPIC[2]: apic_id 14, version 0, address 0xfec01000, GSI 39-74 ACPI: INT_SRC_OVR (bus 0 bus_irq 1 global_irq 4 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 5 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 3 global_irq 6 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 4 global_irq 7 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 6 global_irq 9 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 7 global_irq 10 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 8 global_irq 11 low edge) ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 12 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 12 global_irq 15 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 13 global_irq 16 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 14 global_irq 17 low edge) ACPI: INT_SRC_OVR (bus 0 bus_irq 15 global_irq 18 dfl dfl) It turns out that the system has three io apic controllers, but boot ioapic routing is in the second one, and that gsi_base is not 0 - it is using a bunch of INT_SRC_OVR... So these recent changes: 1. one set routing for first io apic controller 2. assume irq = gsi ... will break that system. So try to remap those gsis, need to seperate boot_ioapic_idx detection out of enable_IO_APIC() and call them early. So introduce boot_ioapic_idx, and remap_ioapic_gsi()... -v2: shift gsi with delta instead of gsi_base of boot_ioapic_idx -v3: double check with find_isa_irq_apic(0, mp_INT) to get right boot_ioapic_idx -v4: nr_legacy_irqs -v5: add print out for boot_ioapic_idx, and also make it could be applied for current kernel and previous kernel -v6: add bus_irq, in acpi_sci_ioapic_setup, so can get overwride for sci right mapping... -v7: looks like pnpacpi get irq instead of gsi, so need to revert them back... -v8: split into two patches -v9: according to Eric, use fixed 16 for shifting instead of remap -v10: still need to touch rsparser.c -v11: just revert back to way Eric suggest... anyway the ioapic in first ioapic is blocked by second... -v12: two patches, this one will add more loop but check apic_id and irq > 16 Reported-by: Iranna D Ankad <iranna.ankad@in.ibm.com> Bisected-by: Iranna D Ankad <iranna.ankad@in.ibm.com> Tested-by: Gary Hade <garyhade@us.ibm.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Thomas Renninger <trenn@suse.de> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Suresh Siddha <suresh.b.siddha@intel.com> Cc: len.brown@intel.com LKML-Reference: <4B8A321A.1000008@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/io_apic.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 72ac2a332993..97e1e3ec2edf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1475,7 +1475,7 @@ static struct { static void __init setup_IO_APIC_irqs(void) { - int apic_id = 0, pin, idx, irq; + int apic_id, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; @@ -1483,14 +1483,7 @@ static void __init setup_IO_APIC_irqs(void) apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); -#ifdef CONFIG_ACPI - if (!acpi_disabled && acpi_ioapic) { - apic_id = mp_find_ioapic(0); - if (apic_id < 0) - apic_id = 0; - } -#endif - + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); if (idx == -1) { @@ -1512,6 +1505,9 @@ static void __init setup_IO_APIC_irqs(void) irq = pin_2_irq(idx, apic_id, pin); + if ((apic_id > 0) && (irq > 16)) + continue; + /* * Skip the timer IRQ if there's a quirk handler * installed and if it returns 1: @@ -4105,27 +4101,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic = 0, irq, irq_entry; + int pin, ioapic, irq, irq_entry; struct irq_desc *desc; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; -#ifdef CONFIG_ACPI - if (!acpi_disabled && acpi_ioapic) { - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - ioapic = 0; - } -#endif - + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; irq = pin_2_irq(irq_entry, ioapic, pin); + if ((ioapic > 0) && (irq > 16)) + continue; + desc = irq_to_desc(irq); /* -- cgit v1.2.2 From 1e259e0a9982078896f3404240096cbea01daca4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Sun, 28 Feb 2010 20:51:15 +0100 Subject: hw-breakpoints: Remove stub unthrottle callback We support event unthrottling in breakpoint events. It means that if we have more than sysctl_perf_event_sample_rate/HZ, perf will throttle, ignoring subsequent events until the next tick. So if ptrace exceeds this max rate, it will omit events, which breaks the ptrace determinism that is supposed to report every triggered breakpoints. This is likely to happen if we set sysctl_perf_event_sample_rate to 1. This patch removes support for unthrottling in breakpoint events to break throttling and restore ptrace determinism. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: 2.6.33.x <stable@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: K.Prasad <prasad@linux.vnet.ibm.com> Cc: Paul Mackerras <paulus@samba.org> --- arch/x86/kernel/hw_breakpoint.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index bb6006e3e295..1e8ceadc0d6a 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -531,8 +531,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } - -void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) -{ - /* TODO */ -} -- cgit v1.2.2 From 339d3261aa3eb0e12f68ef868e042c1ca03628f7 Mon Sep 17 00:00:00 2001 From: Julia Lawall <julia@diku.dk> Date: Sat, 6 Feb 2010 09:42:39 +0100 Subject: x86/amd-iommu: Remove double NULL check in check_device dev was tested just above, so drop the second test. Signed-off-by: Julia Lawall <julia@diku.dk> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index adb0ba025702..2c4a5012038e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -118,7 +118,7 @@ static bool check_device(struct device *dev) return false; /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) + if (dev->bus != &pci_bus_type) return false; devid = get_device_id(dev); -- cgit v1.2.2 From 5d214fe6e808a8caa9cb6f610c0190d3f50ac570 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Mon, 8 Feb 2010 14:44:49 +0100 Subject: x86/amd-iommu: Protect IOMMU-API map/unmap path This patch introduces a mutex to lock page table updates in the IOMMU-API path. We can't use the spin_lock here because this patch might sleep. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 2c4a5012038e..b97f2f1c449a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2327,6 +2327,7 @@ static struct protection_domain *protection_domain_alloc(void) return NULL; spin_lock_init(&domain->lock); + mutex_init(&domain->api_lock); domain->id = domain_id_alloc(); if (!domain->id) goto out_err; @@ -2456,6 +2457,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom, iova &= PAGE_MASK; paddr &= PAGE_MASK; + mutex_lock(&domain->api_lock); + for (i = 0; i < npages; ++i) { ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); if (ret) @@ -2465,6 +2468,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom, paddr += PAGE_SIZE; } + mutex_unlock(&domain->api_lock); + return 0; } @@ -2477,12 +2482,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova &= PAGE_MASK; + mutex_lock(&domain->api_lock); + for (i = 0; i < npages; ++i) { iommu_unmap_page(domain, iova, PM_MAP_4k); iova += PAGE_SIZE; } iommu_flush_tlb_pde(domain); + + mutex_unlock(&domain->api_lock); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, -- cgit v1.2.2 From 04e856c072b84042bb56c487c2868638bb3f78db Mon Sep 17 00:00:00 2001 From: Chris Wright <chrisw@sous-sol.org> Date: Wed, 17 Feb 2010 08:51:20 -0800 Subject: x86/amd-iommu: Pt mode fix for domain_destroy After a guest is shutdown, assigned devices are not properly returned to the pt domain. This can leave the device using stale cached IOMMU data, and result in a non-functional device after it's re-bound to the host driver. For example, I see this upon rebinding: AMD-Vi: Event logged [IO_PAGE_FAULT device=02:00.0 domain=0x0000 address=0x000000007e2a8000 flags=0x0050] AMD-Vi: Event logged [IO_PAGE_FAULT device=02:00.0 domain=0x0000 address=0x000000007e2a8040 flags=0x0050] AMD-Vi: Event logged [IO_PAGE_FAULT device=02:00.0 domain=0x0000 address=0x000000007e2a8080 flags=0x0050] AMD-Vi: Event logged [IO_PAGE_FAULT device=02:00.0 domain=0x0000 address=0x000000007e2a80c0 flags=0x0050] 0000:02:00.0: eth2: Detected Hardware Unit Hang: ... The amd_iommu_destroy_domain() function calls do_detach() which doesn't reattach the pt domain to the device. Use __detach_device() instead. Cc: stable@kernel.org Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b97f2f1c449a..0c0425436a73 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2298,7 +2298,7 @@ static void cleanup_domain(struct protection_domain *domain) list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { struct device *dev = dev_data->dev; - do_detach(dev); + __detach_device(dev); atomic_set(&dev_data->bind, 0); } -- cgit v1.2.2 From 3551a708f35fc712af43aeb7f541512c5cfc4936 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Mon, 1 Mar 2010 13:52:19 +0100 Subject: x86/amd-iommu: Report errors in acpi parsing functions upstream Since acpi_table_parse ignores the return values of the parsing function this patch introduces a workaround and reports these errors upstream via a global variable. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu_init.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9dc91b431470..feaf47184900 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -138,9 +138,9 @@ int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; /* - * Set to true if ACPI table parsing and hardware intialization went properly + * The ACPI table parsing functions set this variable on an error */ -static bool amd_iommu_initialized; +static int __initdata amd_iommu_init_err; /* * List of protection domains - used during resume @@ -391,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) */ for (i = 0; i < table->length; ++i) checksum += p[i]; - if (checksum != 0) + if (checksum != 0) { /* ACPI table corrupt */ - return -ENODEV; + amd_iommu_init_err = -ENODEV; + return 0; + } p += IVRS_HEADER_LENGTH; @@ -920,11 +922,16 @@ static int __init init_iommu_all(struct acpi_table_header *table) h->mmio_phys); iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); - if (iommu == NULL) - return -ENOMEM; + if (iommu == NULL) { + amd_iommu_init_err = -ENOMEM; + return 0; + } + ret = init_iommu_one(iommu, h); - if (ret) - return ret; + if (ret) { + amd_iommu_init_err = ret; + return 0; + } break; default: break; @@ -934,8 +941,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) } WARN_ON(p != end); - amd_iommu_initialized = true; - return 0; } @@ -1211,6 +1216,10 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; + ret = amd_iommu_init_err; + if (ret) + goto out; + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); @@ -1270,12 +1279,19 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; - if (!amd_iommu_initialized) + if (amd_iommu_init_err) { + ret = amd_iommu_init_err; goto free; + } if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; + if (amd_iommu_init_err) { + ret = amd_iommu_init_err; + goto free; + } + ret = sysdev_class_register(&amd_iommu_sysdev_class); if (ret) goto free; -- cgit v1.2.2 From bb1165d6882f423f90fc7007a88c6c993b7c2ac4 Mon Sep 17 00:00:00 2001 From: Robert Richter <robert.richter@amd.com> Date: Mon, 1 Mar 2010 14:21:23 +0100 Subject: perf, x86: rename macro in ARCH_PERFMON_EVENTSEL_ENABLE For consistency reasons this patch renames ARCH_PERFMON_EVENTSEL0_ENABLE to ARCH_PERFMON_EVENTSEL_ENABLE. The following is performed: $ sed -i -e s/ARCH_PERFMON_EVENTSEL0_ENABLE/ARCH_PERFMON_EVENTSEL_ENABLE/g \ arch/x86/include/asm/perf_event.h arch/x86/kernel/cpu/perf_event.c \ arch/x86/kernel/cpu/perf_event_p6.c \ arch/x86/kernel/cpu/perfctr-watchdog.c \ arch/x86/oprofile/op_model_amd.c arch/x86/oprofile/op_model_ppro.c Signed-off-by: Robert Richter <robert.richter@amd.com> --- arch/x86/kernel/cpu/perf_event.c | 8 ++++---- arch/x86/kernel/cpu/perf_event_p6.c | 8 ++++---- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9dddbc..6531b4bdb22d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -553,9 +553,9 @@ static void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(x86_pmu.eventsel + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -590,7 +590,7 @@ static void x86_pmu_enable_all(void) continue; val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -853,7 +853,7 @@ void hw_perf_enable(void) static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); + hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); } static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 1ca5ba078afd..a4e67b99d91c 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -62,7 +62,7 @@ static void p6_pmu_disable_all(void) /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(MSR_P6_EVNTSEL0, val); } @@ -72,7 +72,7 @@ static void p6_pmu_enable_all(void) /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(MSR_P6_EVNTSEL0, val); } @@ -83,7 +83,7 @@ p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) u64 val = P6_NOP_EVENT; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)checking_wrmsrl(hwc->config_base + idx, val); } @@ -95,7 +95,7 @@ static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) val = hwc->config; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)checking_wrmsrl(hwc->config_base + idx, val); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 74f4e85a5727..fb329e9f8494 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -680,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) cpu_nmi_set_wd_enabled(); apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; -- cgit v1.2.2 From be43f83dada2cf0e9e01c9a0ba42977c5bd70f9d Mon Sep 17 00:00:00 2001 From: Sheng Yang <sheng@linux.intel.com> Date: Fri, 18 Dec 2009 16:48:45 +0800 Subject: x86: Raise vsyscall priority on hotplug notifier chain KVM need vsyscall_init() to initialize MSR_TSC_AUX before it read the value. Per Avi's suggestion, this patch raised vsyscall priority on hotplug notifier chain, to 30. CC: Ingo Molnar <mingo@elte.hu> CC: linux-kernel@vger.kernel.org Signed-off-by: Sheng Yang <sheng@linux.intel.com> Signed-off-by: Avi Kivity <avi@redhat.com> --- arch/x86/kernel/vsyscall_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e5872ff0..1c0c6ab9c60f 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,8 @@ static int __init vsyscall_init(void) register_sysctl_table(kernel_root_table2); #endif on_each_cpu(cpu_vsyscall_init, NULL, 1); - hotcpu_notifier(cpu_vsyscall_notifier, 0); + /* notifier priority > KVM */ + hotcpu_notifier(cpu_vsyscall_notifier, 30); return 0; } -- cgit v1.2.2 From 14be1f7454ea96ee614467a49cf018a1a383b189 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich <sivanich@sgi.com> Date: Mon, 1 Mar 2010 11:48:15 -0600 Subject: x86: Fix sched_clock_cpu for systems with unsynchronized TSC On UV systems, the TSC is not synchronized across blades. The sched_clock_cpu() function is returning values that can go backwards (I've seen as much as 8 seconds) when switching between cpus. As each cpu comes up, early_init_intel() will currently set the sched_clock_stable flag true. When mark_tsc_unstable() runs, it clears the flag, but this only occurs once (the first time a cpu comes up whose TSC is not synchronized with cpu 0). After this, early_init_intel() will set the flag again as the next cpu comes up. Only set sched_clock_stable if tsc has not been marked unstable. Signed-off-by: Dimitri Sivanich <sivanich@sgi.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <20100301174815.GC8224@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/intel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 879666f4d871..7e1cca13af35 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - sched_clock_stable = 1; + if (!check_tsc_unstable()) + sched_clock_stable = 1; } /* -- cgit v1.2.2 From 320ebf09cbb6d01954c9a060266aa8e0d27f4638 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Tue, 2 Mar 2010 12:35:37 +0100 Subject: perf, x86: Restrict the ANY flag The ANY flag can show SMT data of another task (like 'top'), so we want to disable it when system-wide profiling is disabled. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6531b4bdb22d..aab2e1ce9dee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -503,6 +503,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); + if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && + perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; return 0; } -- cgit v1.2.2 From b622d644c7d61a5cb95b74e7b143c263bed21f0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Mon, 1 Feb 2010 15:36:30 +0100 Subject: perf_events, x86: Fixup fixed counter constraints Patch 1da53e0230 ("perf_events, x86: Improve x86 event scheduling") lost us one of the fixed purpose counters and then ed8777fc13 ("perf_events, x86: Fix event constraint masks") broke it even further. Widen the fixed event mask to event+umask and specify the full config for each of the 3 fixed purpose counters. Then let the init code fill out the placement for the GP regs based on the cpuid info. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 25 ++++++++++++++++++------- arch/x86/kernel/cpu/perf_event_intel.c | 31 +++++++++++++++++++++---------- 2 files changed, 39 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index aab2e1ce9dee..bfc43fa208bc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -73,10 +73,10 @@ struct debug_store { struct event_constraint { union { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64[1]; + u64 idxmsk64; }; - int code; - int cmask; + u64 code; + u64 cmask; int weight; }; @@ -103,7 +103,7 @@ struct cpu_hw_events { }; #define __EVENT_CONSTRAINT(c, n, m, w) {\ - { .idxmsk64[0] = (n) }, \ + { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ @@ -116,7 +116,7 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) #define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) + EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) @@ -615,8 +615,8 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) bitmap_zero(used_mask, X86_PMC_IDX_MAX); for (i = 0; i < n; i++) { - constraints[i] = - x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + constraints[i] = c; } /* @@ -1350,6 +1350,7 @@ static void __init pmu_check_apic(void) void __init init_hw_perf_events(void) { + struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1398,6 +1399,16 @@ void __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0, x86_pmu.num_events); + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != INTEL_ARCH_FIXED_MASK) + continue; + + c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; + c->weight += x86_pmu.num_events; + } + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590cf4a5f..4fbdfe5708d9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,7 +1,7 @@ #ifdef CONFIG_CPU_SUP_INTEL /* - * Intel PerfMon v3. Used on Core2 and later. + * Intel PerfMon, used on Core and later. */ static const u64 intel_perfmon_event_map[] = { @@ -27,8 +27,14 @@ static struct event_constraint intel_core_event_constraints[] = static struct event_constraint intel_core2_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* + * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event + * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed + * ratio between these counters. + */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -37,14 +43,16 @@ static struct event_constraint intel_core2_event_constraints[] = INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_nehalem_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -58,8 +66,9 @@ static struct event_constraint intel_nehalem_event_constraints[] = static struct event_constraint intel_westmere_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -68,8 +77,9 @@ static struct event_constraint intel_westmere_event_constraints[] = static struct event_constraint intel_gen_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; @@ -935,7 +945,7 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; - case 28: + case 28: /* Atom */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -951,6 +961,7 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_westmere_event_constraints; pr_cont("Westmere events, "); break; + default: /* * default constraints for v2 and up -- cgit v1.2.2 From 29044ad1509ecc229f1d5a31aeed7a8dc61a71c4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Wed, 3 Mar 2010 02:25:22 +0100 Subject: x86/stacktrace: Don't dereference bad frame pointers Callers of a stacktrace might pass bad frame pointers. Those are usually checked for safety in stack walking helpers before any dereferencing, but this is not the case when we need to go through one more frame pointer that backlinks the irq stack to the previous one, as we don't have any reliable address boudaries to compare this frame pointer against. This raises crashes when we record callchains for ftrace events with perf because we don't use the right helpers to capture registers there. We get wrong frame pointers as we call task_pt_regs() even on kernel threads, which is a wrong thing as it gives us the initial state of any kernel threads freshly created. This is even not what we want for user tasks. What we want is a hot snapshot of registers when the ftrace event triggers, not the state before a task entered the kernel. This requires more thoughts to do it correctly though. So first put a guardian to ensure the given frame pointer can be dereferenced to avoid crashes. We'll think about how to fix the callers in a subsequent patch. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: 2.6.33.x <stable@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> --- arch/x86/kernel/dumpstack_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f5..a6c906c9b193 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -125,9 +125,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, { #ifdef CONFIG_FRAME_POINTER struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long next; - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) - return (unsigned long)frame->next_frame; + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { + if (!probe_kernel_address(&frame->next_frame, next)) + return next; + else + WARN_ONCE(1, "Perf: bad frame pointer = %p in " + "callchain\n", &frame->next_frame); + } #endif return bp; } -- cgit v1.2.2 From 3010673ef5f7bef4b4685566a0713de1b4306c93 Mon Sep 17 00:00:00 2001 From: Jacob Pan <jacob.jun.pan@linux.intel.com> Date: Tue, 2 Mar 2010 21:01:34 -0800 Subject: x86, mrst: Fix APB timer per cpu clockevent The current APB timer code incorrectly registers a static copy of the clockevent device for the boot CPU. The per cpu clockevent should be used instead. This bug was hidden by zero-initialized data; as such it did not get exposed in testing, but was discovered by code review. Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> LKML-Reference: <1267592494-7723-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apb_timer.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 83a345b0256c..6f27f8b75795 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -84,9 +84,10 @@ struct apbt_dev { int disable_apbt_percpu __cpuinitdata; +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); + #ifdef CONFIG_SMP static unsigned int apbt_num_timers_used; -static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); static struct apbt_dev *apbt_devs; #endif @@ -302,6 +303,7 @@ static void apbt_disable_int(int n) static int __init apbt_clockevent_register(void) { struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); if (mtmr == NULL) { @@ -329,22 +331,24 @@ static int __init apbt_clockevent_register(void) * global if not used for per cpu timer. */ apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + adev->num = smp_processor_id(); + memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); if (disable_apbt_percpu) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; - global_clock_event = &apbt_clockevent; + global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", global_clock_event->name); } if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - apbt_clockevent.name, &apbt_clockevent)) { + apbt_clockevent.name, adev)) { printk(KERN_ERR "Failed request IRQ for APBT%d\n", apbt_clockevent.irq); } - clockevents_register_device(&apbt_clockevent); + clockevents_register_device(&adev->evt); /* Start APBT 0 interrupts */ apbt_enable_int(APBT_CLOCKEVENT0_NUM); -- cgit v1.2.2 From c7bbf52aa4fa332b84c4f2bb33e69561ee6870b4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Wed, 3 Mar 2010 13:38:48 -0800 Subject: x86, mrst: Fix whitespace breakage in apb_timer.c Checkin bb24c4716185f6e116c440462c65c1f56649183b: "Moorestown APB system timer driver" suffered from severe whitespace damage in arch/x86/kernel/apb_timer.c due to using Microsoft Lookout to send a patch. Fix the whitespace breakage. Reported-by: Jacob Pan <jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/apb_timer.c | 1068 +++++++++++++++++++++---------------------- 1 file changed, 534 insertions(+), 534 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 6f27f8b75795..2afa27d01297 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,11 +43,11 @@ #include <asm/fixmap.h> #include <asm/apb_timer.h> -#define APBT_MASK CLOCKSOURCE_MASK(32) -#define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 -#define APBT_CLOCKSOURCE_RATING 250 -#define APBT_MIN_DELTA_USEC 200 +#define APBT_MASK CLOCKSOURCE_MASK(32) +#define APBT_SHIFT 22 +#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKSOURCE_RATING 250 +#define APBT_MIN_DELTA_USEC 200 #define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) #define APBT_CLOCKEVENT0_NUM (0) @@ -65,21 +65,21 @@ static int phy_cs_timer_id; static uint64_t apbt_freq; static void apbt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt); + struct clock_event_device *evt); static int apbt_next_event(unsigned long delta, - struct clock_event_device *evt); + struct clock_event_device *evt); static cycle_t apbt_read_clocksource(struct clocksource *cs); static void apbt_restart_clocksource(void); struct apbt_dev { - struct clock_event_device evt; - unsigned int num; - int cpu; - unsigned int irq; - unsigned int tick; - unsigned int count; - unsigned int flags; - char name[10]; + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int tick; + unsigned int count; + unsigned int flags; + char name[10]; }; int disable_apbt_percpu __cpuinitdata; @@ -91,77 +91,77 @@ static unsigned int apbt_num_timers_used; static struct apbt_dev *apbt_devs; #endif -static inline unsigned long apbt_readl_reg(unsigned long a) +static inline unsigned long apbt_readl_reg(unsigned long a) { - return readl(apbt_virt_address + a); + return readl(apbt_virt_address + a); } static inline void apbt_writel_reg(unsigned long d, unsigned long a) { - writel(d, apbt_virt_address + a); + writel(d, apbt_virt_address + a); } static inline unsigned long apbt_readl(int n, unsigned long a) { - return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); + return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); } static inline void apbt_writel(int n, unsigned long d, unsigned long a) { - writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); + writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); } static inline void apbt_set_mapping(void) { - struct sfi_timer_table_entry *mtmr; - - if (apbt_virt_address) { - pr_debug("APBT base already mapped\n"); - return; - } - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return; - } - apbt_address = (unsigned long)mtmr->phys_addr; - if (!apbt_address) { - printk(KERN_WARNING "No timer base from SFI, use default\n"); - apbt_address = APBT_DEFAULT_BASE; - } - apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); - if (apbt_virt_address) { - pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ - (void *)apbt_address, (void *)apbt_virt_address); - } else { - pr_debug("Failed mapping APBT phy address at %p\n",\ - (void *)apbt_address); - goto panic_noapbt; - } - apbt_freq = mtmr->freq_hz / USEC_PER_SEC; - sfi_free_mtmr(mtmr); - - /* Now figure out the physical timer id for clocksource device */ - mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); - if (mtmr == NULL) - goto panic_noapbt; - - /* Now figure out the physical timer id */ - phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) - / APBTMRS_REG_SIZE; - pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); - return; + struct sfi_timer_table_entry *mtmr; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (unsigned long)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (apbt_virt_address) { + pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ + (void *)apbt_address, (void *)apbt_virt_address); + } else { + pr_debug("Failed mapping APBT phy address at %p\n",\ + (void *)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) + / APBTMRS_REG_SIZE; + pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + return; panic_noapbt: - panic("Failed to setup APB system timer\n"); + panic("Failed to setup APB system timer\n"); } static inline void apbt_clear_mapping(void) { - iounmap(apbt_virt_address); - apbt_virt_address = NULL; + iounmap(apbt_virt_address); + apbt_virt_address = NULL; } /* @@ -169,28 +169,28 @@ static inline void apbt_clear_mapping(void) */ static inline int is_apbt_capable(void) { - return apbt_virt_address ? 1 : 0; + return apbt_virt_address ? 1 : 0; } static struct clocksource clocksource_apbt = { - .name = "apbt", - .rating = APBT_CLOCKSOURCE_RATING, - .read = apbt_read_clocksource, - .mask = APBT_MASK, - .shift = APBT_SHIFT, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .resume = apbt_restart_clocksource, + .name = "apbt", + .rating = APBT_CLOCKSOURCE_RATING, + .read = apbt_read_clocksource, + .mask = APBT_MASK, + .shift = APBT_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = apbt_restart_clocksource, }; /* boot APB clock event device */ static struct clock_event_device apbt_clockevent = { - .name = "apbt0", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = apbt_set_mode, - .set_next_event = apbt_next_event, - .shift = APBT_SHIFT, - .irq = 0, - .rating = APBT_CLOCKEVENT_RATING, + .name = "apbt0", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = apbt_set_mode, + .set_next_event = apbt_next_event, + .shift = APBT_SHIFT, + .irq = 0, + .rating = APBT_CLOCKEVENT_RATING, }; /* @@ -199,20 +199,20 @@ static struct clock_event_device apbt_clockevent = { */ static inline int __init setup_x86_mrst_timer(char *arg) { - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + disable_apbt_percpu = 0; + else if (strcmp("lapic_and_apbt", arg) == 0) + disable_apbt_percpu = 1; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; } __setup("x86_mrst_timer=", setup_x86_mrst_timer); @@ -222,176 +222,176 @@ __setup("x86_mrst_timer=", setup_x86_mrst_timer); */ static void apbt_start_counter(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); - /* enable, mask interrupt */ - ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; - ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - /* read it once to get cached counter value initialized */ - apbt_read_clocksource(&clocksource_apbt); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); + /* enable, mask interrupt */ + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + /* read it once to get cached counter value initialized */ + apbt_read_clocksource(&clocksource_apbt); } static irqreturn_t apbt_interrupt_handler(int irq, void *data) { - struct apbt_dev *dev = (struct apbt_dev *)data; - struct clock_event_device *aevt = &dev->evt; - - if (!aevt->event_handler) { - printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", - dev->num); - return IRQ_NONE; - } - aevt->event_handler(aevt); - return IRQ_HANDLED; + struct apbt_dev *dev = (struct apbt_dev *)data; + struct clock_event_device *aevt = &dev->evt; + + if (!aevt->event_handler) { + printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", + dev->num); + return IRQ_NONE; + } + aevt->event_handler(aevt); + return IRQ_HANDLED; } static void apbt_restart_clocksource(void) { - apbt_start_counter(phy_cs_timer_id); + apbt_start_counter(phy_cs_timer_id); } /* Setup IRQ routing via IOAPIC */ #ifdef CONFIG_SMP static void apbt_setup_irq(struct apbt_dev *adev) { - struct irq_chip *chip; - struct irq_desc *desc; - - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - desc = irq_to_desc(adev->irq); - chip = get_irq_chip(adev->irq); - disable_irq(adev->irq); - desc->status |= IRQ_MOVE_PCNTXT; - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); - /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ - set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); - enable_irq(adev->irq); - if (system_state == SYSTEM_BOOTING) - if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - adev->name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - adev->num); - } + struct irq_chip *chip; + struct irq_desc *desc; + + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + desc = irq_to_desc(adev->irq); + chip = get_irq_chip(adev->irq); + disable_irq(adev->irq); + desc->status |= IRQ_MOVE_PCNTXT; + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ + set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); + enable_irq(adev->irq); + if (system_state == SYSTEM_BOOTING) + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } } #endif static void apbt_enable_int(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - /* clear pending intr */ - apbt_readl(n, APBTMR_N_EOI); - ctrl &= ~APBTMR_CONTROL_INT; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + /* clear pending intr */ + apbt_readl(n, APBTMR_N_EOI); + ctrl &= ~APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); } static void apbt_disable_int(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - ctrl |= APBTMR_CONTROL_INT; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); } static int __init apbt_clockevent_register(void) { - struct sfi_timer_table_entry *mtmr; - struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); - - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return -ENODEV; - } - - /* - * We need to calculate the scaled math multiplication factor for - * nanosecond to apbt tick conversion. - * mult = (nsec/cycle)*2^APBT_SHIFT - */ - apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz - , NSEC_PER_SEC, APBT_SHIFT); - - /* Calculate the min / max delta */ - apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &apbt_clockevent); - apbt_clockevent.min_delta_ns = clockevent_delta2ns( - APBT_MIN_DELTA_USEC*apbt_freq, - &apbt_clockevent); - /* - * Start apbt with the boot cpu mask and make it - * global if not used for per cpu timer. - */ - apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); - adev->num = smp_processor_id(); - memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - - if (disable_apbt_percpu) { - apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; + struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz + , NSEC_PER_SEC, APBT_SHIFT); + + /* Calculate the min / max delta */ + apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &apbt_clockevent); + apbt_clockevent.min_delta_ns = clockevent_delta2ns( + APBT_MIN_DELTA_USEC*apbt_freq, + &apbt_clockevent); + /* + * Start apbt with the boot cpu mask and make it + * global if not used for per cpu timer. + */ + apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + adev->num = smp_processor_id(); + memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); + + if (disable_apbt_percpu) { + apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; - printk(KERN_DEBUG "%s clockevent registered as global\n", - global_clock_event->name); - } - - if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - apbt_clockevent.name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - apbt_clockevent.irq); - } - - clockevents_register_device(&adev->evt); - /* Start APBT 0 interrupts */ - apbt_enable_int(APBT_CLOCKEVENT0_NUM); - - sfi_free_mtmr(mtmr); - return 0; + printk(KERN_DEBUG "%s clockevent registered as global\n", + global_clock_event->name); + } + + if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + apbt_clockevent.name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + apbt_clockevent.irq); + } + + clockevents_register_device(&adev->evt); + /* Start APBT 0 interrupts */ + apbt_enable_int(APBT_CLOCKEVENT0_NUM); + + sfi_free_mtmr(mtmr); + return 0; } #ifdef CONFIG_SMP /* Should be called with per cpu */ void apbt_setup_secondary_clock(void) { - struct apbt_dev *adev; - struct clock_event_device *aevt; - int cpu; - - /* Don't register boot CPU clockevent */ - cpu = smp_processor_id(); - if (cpu == boot_cpu_id) - return; - /* - * We need to calculate the scaled math multiplication factor for - * nanosecond to apbt tick conversion. - * mult = (nsec/cycle)*2^APBT_SHIFT - */ - printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); - adev = &per_cpu(cpu_apbt_dev, cpu); - aevt = &adev->evt; - - memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); - aevt->cpumask = cpumask_of(cpu); - aevt->name = adev->name; - aevt->mode = CLOCK_EVT_MODE_UNUSED; - - printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", - cpu, aevt->name, *(u32 *)aevt->cpumask); - - apbt_setup_irq(adev); - - clockevents_register_device(aevt); - - apbt_enable_int(cpu); - - return; + struct apbt_dev *adev; + struct clock_event_device *aevt; + int cpu; + + /* Don't register boot CPU clockevent */ + cpu = smp_processor_id(); + if (cpu == boot_cpu_id) + return; + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); + adev = &per_cpu(cpu_apbt_dev, cpu); + aevt = &adev->evt; + + memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); + aevt->cpumask = cpumask_of(cpu); + aevt->name = adev->name; + aevt->mode = CLOCK_EVT_MODE_UNUSED; + + printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", + cpu, aevt->name, *(u32 *)aevt->cpumask); + + apbt_setup_irq(adev); + + clockevents_register_device(aevt); + + apbt_enable_int(cpu); + + return; } /* @@ -405,34 +405,34 @@ void apbt_setup_secondary_clock(void) * the extra interrupt is harmless. */ static int apbt_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { - unsigned long cpu = (unsigned long)hcpu; - struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - - switch (action & 0xf) { - case CPU_DEAD: - apbt_disable_int(cpu); - if (system_state == SYSTEM_RUNNING) - pr_debug("skipping APBT CPU %lu offline\n", cpu); - else if (adev) { - pr_debug("APBT clockevent for cpu %lu offline\n", cpu); - free_irq(adev->irq, adev); - } - break; - default: - pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); - } - return NOTIFY_OK; + unsigned long cpu = (unsigned long)hcpu; + struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + + switch (action & 0xf) { + case CPU_DEAD: + apbt_disable_int(cpu); + if (system_state == SYSTEM_RUNNING) + pr_debug("skipping APBT CPU %lu offline\n", cpu); + else if (adev) { + pr_debug("APBT clockevent for cpu %lu offline\n", cpu); + free_irq(adev->irq, adev); + } + break; + default: + pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + } + return NOTIFY_OK; } static __init int apbt_late_init(void) { - if (disable_apbt_percpu) - return 0; - /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(apbt_cpuhp_notify, -20); - return 0; + if (disable_apbt_percpu) + return 0; + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(apbt_cpuhp_notify, -20); + return 0; } fs_initcall(apbt_late_init); #else @@ -442,93 +442,93 @@ void apbt_setup_secondary_clock(void) {} #endif /* CONFIG_SMP */ static void apbt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long ctrl; - uint64_t delta; - int timer_num; - struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); - - timer_num = adev->num; - pr_debug("%s CPU %d timer %d mode=%d\n", - __func__, first_cpu(*evt->cpumask), timer_num, mode); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; - delta >>= apbt_clockevent.shift; - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl |= APBTMR_CONTROL_MODE_PERIODIC; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* - * DW APB p. 46, have to disable timer before load counter, - * may cause sync problem. - */ - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - udelay(1); - pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); - apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - /* APB timer does not have one-shot mode, use free running mode */ - case CLOCK_EVT_MODE_ONESHOT: - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - /* - * set free running mode, this mode will let timer reload max - * timeout which will give time (3min on 25MHz clock) to rearm - * the next event, therefore emulate the one-shot mode. - */ - ctrl &= ~APBTMR_CONTROL_ENABLE; - ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; - - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* write again to set free running mode */ - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - - /* - * DW APB p. 46, load counter with all 1s before starting free - * running mode. - */ - apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); - ctrl &= ~APBTMR_CONTROL_INT; - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - apbt_disable_int(timer_num); - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - - case CLOCK_EVT_MODE_RESUME: - apbt_enable_int(timer_num); - break; - } + unsigned long ctrl; + uint64_t delta; + int timer_num; + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + pr_debug("%s CPU %d timer %d mode=%d\n", + __func__, first_cpu(*evt->cpumask), timer_num, mode); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; + delta >>= apbt_clockevent.shift; + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_MODE_PERIODIC; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* + * DW APB p. 46, have to disable timer before load counter, + * may cause sync problem. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + udelay(1); + pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + /* APB timer does not have one-shot mode, use free running mode */ + case CLOCK_EVT_MODE_ONESHOT: + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + /* + * set free running mode, this mode will let timer reload max + * timeout which will give time (3min on 25MHz clock) to rearm + * the next event, therefore emulate the one-shot mode. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write again to set free running mode */ + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + + /* + * DW APB p. 46, load counter with all 1s before starting free + * running mode. + */ + apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); + ctrl &= ~APBTMR_CONTROL_INT; + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + apbt_disable_int(timer_num); + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_RESUME: + apbt_enable_int(timer_num); + break; + } } static int apbt_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long ctrl; - int timer_num; - - struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); - - timer_num = adev->num; - /* Disable timer */ - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* write new count */ - apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - return 0; + unsigned long ctrl; + int timer_num; + + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + /* Disable timer */ + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write new count */ + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + return 0; } /* @@ -540,94 +540,94 @@ static int apbt_next_event(unsigned long delta, */ static cycle_t apbt_read_clocksource(struct clocksource *cs) { - unsigned long t0, t1, t2; - static unsigned long last_read; + unsigned long t0, t1, t2; + static unsigned long last_read; bad_count: - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if (unlikely(t1 < t2)) { - pr_debug("APBT: read current count error %lx:%lx:%lx\n", - t1, t2, t2 - t1); - goto bad_count; - } - /* - * check against cached last read, makes sure time does not go back. - * it could be a normal rollover but we will do tripple check anyway - */ - if (unlikely(t2 > last_read)) { - /* check if we have a normal rollover */ - unsigned long raw_intr_status = - apbt_readl_reg(APBTMRS_RAW_INT_STATUS); - /* - * cs timer interrupt is masked but raw intr bit is set if - * rollover occurs. then we read EOI reg to clear it. - */ - if (raw_intr_status & (1 << phy_cs_timer_id)) { - apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); - goto out; - } - pr_debug("APB CS going back %lx:%lx:%lx ", - t2, last_read, t2 - last_read); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if (unlikely(t1 < t2)) { + pr_debug("APBT: read current count error %lx:%lx:%lx\n", + t1, t2, t2 - t1); + goto bad_count; + } + /* + * check against cached last read, makes sure time does not go back. + * it could be a normal rollover but we will do tripple check anyway + */ + if (unlikely(t2 > last_read)) { + /* check if we have a normal rollover */ + unsigned long raw_intr_status = + apbt_readl_reg(APBTMRS_RAW_INT_STATUS); + /* + * cs timer interrupt is masked but raw intr bit is set if + * rollover occurs. then we read EOI reg to clear it. + */ + if (raw_intr_status & (1 << phy_cs_timer_id)) { + apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); + goto out; + } + pr_debug("APB CS going back %lx:%lx:%lx ", + t2, last_read, t2 - last_read); bad_count_x3: - pr_debug(KERN_INFO "tripple check enforced\n"); - t0 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if ((t2 > t1) || (t1 > t0)) { - printk(KERN_ERR "Error: APB CS tripple check failed\n"); - goto bad_count_x3; - } - } + pr_debug(KERN_INFO "tripple check enforced\n"); + t0 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if ((t2 > t1) || (t1 > t0)) { + printk(KERN_ERR "Error: APB CS tripple check failed\n"); + goto bad_count_x3; + } + } out: - last_read = t2; - return (cycle_t)~t2; + last_read = t2; + return (cycle_t)~t2; } static int apbt_clocksource_register(void) { - u64 start, now; - cycle_t t1; - - /* Start the counter, use timer 2 as source, timer 0/1 for event */ - apbt_start_counter(phy_cs_timer_id); - - /* Verify whether apbt counter works */ - t1 = apbt_read_clocksource(&clocksource_apbt); - rdtscll(start); - - /* - * We don't know the TSC frequency yet, but waiting for - * 200000 TSC cycles is safe: - * 4 GHz == 50us - * 1 GHz == 200us - */ - do { - rep_nop(); - rdtscll(now); - } while ((now - start) < 200000UL); - - /* APBT is the only always on clocksource, it has to work! */ - if (t1 == apbt_read_clocksource(&clocksource_apbt)) - panic("APBT counter not counting. APBT disabled\n"); - - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); - - return 0; + u64 start, now; + cycle_t t1; + + /* Start the counter, use timer 2 as source, timer 0/1 for event */ + apbt_start_counter(phy_cs_timer_id); + + /* Verify whether apbt counter works */ + t1 = apbt_read_clocksource(&clocksource_apbt); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + /* APBT is the only always on clocksource, it has to work! */ + if (t1 == apbt_read_clocksource(&clocksource_apbt)) + panic("APBT counter not counting. APBT disabled\n"); + + /* + * initialize and register APBT clocksource + * convert that to ns/clock cycle + * mult = (ns/c) * 2^APBT_SHIFT + */ + clocksource_apbt.mult = div_sc(MSEC_PER_SEC, + (unsigned long) apbt_freq, APBT_SHIFT); + clocksource_register(&clocksource_apbt); + + return 0; } /* @@ -640,145 +640,145 @@ static int apbt_clocksource_register(void) void __init apbt_time_init(void) { #ifdef CONFIG_SMP - int i; - struct sfi_timer_table_entry *p_mtmr; - unsigned int percpu_timer; - struct apbt_dev *adev; + int i; + struct sfi_timer_table_entry *p_mtmr; + unsigned int percpu_timer; + struct apbt_dev *adev; #endif - if (apb_timer_block_enabled) - return; - apbt_set_mapping(); - if (apbt_virt_address) { - pr_debug("Found APBT version 0x%lx\n",\ - apbt_readl_reg(APBTMRS_COMP_VERSION)); - } else - goto out_noapbt; - /* - * Read the frequency and check for a sane value, for ESL model - * we extend the possible clock range to allow time scaling. - */ - - if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { - pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); - goto out_noapbt; - } - if (apbt_clocksource_register()) { - pr_debug("APBT has failed to register clocksource\n"); - goto out_noapbt; - } - if (!apbt_clockevent_register()) - apb_timer_block_enabled = 1; - else { - pr_debug("APBT has failed to register clockevent\n"); - goto out_noapbt; - } + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (apbt_virt_address) { + pr_debug("Found APBT version 0x%lx\n",\ + apbt_readl_reg(APBTMRS_COMP_VERSION)); + } else + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } #ifdef CONFIG_SMP - /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { - printk(KERN_INFO "apbt: disabled per cpu timer\n"); - return; - } - pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); - if (num_possible_cpus() <= sfi_mtimer_num) { - percpu_timer = 1; - apbt_num_timers_used = num_possible_cpus(); - } else { - percpu_timer = 0; - apbt_num_timers_used = 1; - adev = &per_cpu(cpu_apbt_dev, 0); - adev->flags &= ~APBT_DEV_USED; - } - pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); - - /* here we set up per CPU timer data structure */ - apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, - GFP_KERNEL); - if (!apbt_devs) { - printk(KERN_ERR "Failed to allocate APB timer devices\n"); - return; - } - for (i = 0; i < apbt_num_timers_used; i++) { - adev = &per_cpu(cpu_apbt_dev, i); - adev->num = i; - adev->cpu = i; - p_mtmr = sfi_get_mtmr(i); - if (p_mtmr) { - adev->tick = p_mtmr->freq_hz; - adev->irq = p_mtmr->irq; - } else - printk(KERN_ERR "Failed to get timer for cpu %d\n", i); - adev->count = 0; - sprintf(adev->name, "apbt%d", i); - } + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (disable_apbt_percpu) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= sfi_mtimer_num) { + percpu_timer = 1; + apbt_num_timers_used = num_possible_cpus(); + } else { + percpu_timer = 0; + apbt_num_timers_used = 1; + adev = &per_cpu(cpu_apbt_dev, 0); + adev->flags &= ~APBT_DEV_USED; + } + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, + GFP_KERNEL); + if (!apbt_devs) { + printk(KERN_ERR "Failed to allocate APB timer devices\n"); + return; + } + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) { + adev->tick = p_mtmr->freq_hz; + adev->irq = p_mtmr->irq; + } else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + adev->count = 0; + sprintf(adev->name, "apbt%d", i); + } #endif - return; + return; out_noapbt: - apbt_clear_mapping(); - apb_timer_block_enabled = 0; - panic("failed to enable APB timer\n"); + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); } static inline void apbt_disable(int n) { - if (is_apbt_capable()) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - } + if (is_apbt_capable()) { + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + } } /* called before apb_timer_enable, use early map */ unsigned long apbt_quick_calibrate() { - int i, scale; - u64 old, new; - cycle_t t1, t2; - unsigned long khz = 0; - u32 loop, shift; - - apbt_set_mapping(); - apbt_start_counter(phy_cs_timer_id); - - /* check if the timer can count down, otherwise return */ - old = apbt_read_clocksource(&clocksource_apbt); - i = 10000; - while (--i) { - if (old != apbt_read_clocksource(&clocksource_apbt)) - break; - } - if (!i) - goto failed; - - /* count 16 ms */ - loop = (apbt_freq * 1000) << 4; - - /* restart the timer to ensure it won't get to 0 in the calibration */ - apbt_start_counter(phy_cs_timer_id); - - old = apbt_read_clocksource(&clocksource_apbt); - old += loop; - - t1 = __native_read_tsc(); - - do { - new = apbt_read_clocksource(&clocksource_apbt); - } while (new < old); - - t2 = __native_read_tsc(); - - shift = 5; - if (unlikely(loop >> shift == 0)) { - printk(KERN_INFO - "APBT TSC calibration failed, not enough resolution\n"); - return 0; - } - scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * apbt_freq * 1000) >> shift; - printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); - return khz; + int i, scale; + u64 old, new; + cycle_t t1, t2; + unsigned long khz = 0; + u32 loop, shift; + + apbt_set_mapping(); + apbt_start_counter(phy_cs_timer_id); + + /* check if the timer can count down, otherwise return */ + old = apbt_read_clocksource(&clocksource_apbt); + i = 10000; + while (--i) { + if (old != apbt_read_clocksource(&clocksource_apbt)) + break; + } + if (!i) + goto failed; + + /* count 16 ms */ + loop = (apbt_freq * 1000) << 4; + + /* restart the timer to ensure it won't get to 0 in the calibration */ + apbt_start_counter(phy_cs_timer_id); + + old = apbt_read_clocksource(&clocksource_apbt); + old += loop; + + t1 = __native_read_tsc(); + + do { + new = apbt_read_clocksource(&clocksource_apbt); + } while (new < old); + + t2 = __native_read_tsc(); + + shift = 5; + if (unlikely(loop >> shift == 0)) { + printk(KERN_INFO + "APBT TSC calibration failed, not enough resolution\n"); + return 0; + } + scale = (int)div_u64((t2 - t1), loop >> shift); + khz = (scale * apbt_freq * 1000) >> shift; + printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); + return khz; failed: - return 0; + return 0; } -- cgit v1.2.2 From e5a11016643d1ab7172193591506d33a844734cc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Wed, 3 Mar 2010 22:38:50 -0500 Subject: x86: Issue at least one memory barrier in stop_machine_text_poke() Fix stop_machine_text_poke() to issue smp_mb() before exiting waiting loop, and use cpu_relax() for waiting. Changes in v2: - Don't use ACCESS_ONCE(). Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: systemtap <systemtap@sources.redhat.com> Cc: DLE <dle-develop@lists.sourceforge.net> Cc: Jason Baron <jbaron@redhat.com> LKML-Reference: <20100304033850.3819.74590.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/alternative.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c41f13c15e8f..e0b877099470 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -595,8 +595,8 @@ static int __kprobes stop_machine_text_poke(void *data) wrote_text = 1; } else { while (!wrote_text) - smp_rmb(); - sync_core(); + cpu_relax(); + smp_mb(); /* Load wrote_text before following execution */ } flush_icache_range((unsigned long)tpp->addr, -- cgit v1.2.2 From 6c550ee41596798cbd873d3df9f8ea0a4ce7ad2f Mon Sep 17 00:00:00 2001 From: Randy Dunlap <randy.dunlap@oracle.com> Date: Fri, 5 Mar 2010 09:52:52 -0800 Subject: x86: fix mtrr missing kernel-doc Fix missing kernel-doc notation in mtrr/main.c: Warning(arch/x86/kernel/cpu/mtrr/main.c:152): No description found for parameter 'info' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/cpu/mtrr/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index fe4622e8c837..79556bd9b602 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -145,6 +145,7 @@ struct set_mtrr_data { /** * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * @info: pointer to mtrr configuration data * * Returns nothing. */ -- cgit v1.2.2 From 984b3f5746ed2cde3d184651dabf26980f2b66e5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita <akinobu.mita@gmail.com> Date: Fri, 5 Mar 2010 13:41:37 -0800 Subject: bitops: rename for_each_bit() to for_each_set_bit() Rename for_each_bit to for_each_set_bit in the kernel source tree. To permit for_each_clear_bit(), should that ever be added. The patch includes a macro to map the old for_each_bit() onto the new for_each_set_bit(). This is a (very) temporary thing to ease the migration. [akpm@linux-foundation.org: add temporary for_each_bit()] Suggested-by: Alexey Dobriyan <adobriyan@gmail.com> Suggested-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Russell King <rmk@arm.linux.org.uk> Cc: David Woodhouse <dwmw2@infradead.org> Cc: Artem Bityutskiy <dedekind@infradead.org> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9dddbc..b1fbdeecf6c9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -676,7 +676,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (c->weight != w) continue; - for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590cf4a5f..977e7544738c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -757,7 +757,7 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); -- cgit v1.2.2 From 52cf25d0ab7f78eeecc59ac652ed5090f69b619e Mon Sep 17 00:00:00 2001 From: Emese Revfy <re.emese@gmail.com> Date: Tue, 19 Jan 2010 02:58:23 +0100 Subject: Driver core: Constify struct sysfs_ops in struct kobj_type Constify struct sysfs_ops. This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy <re.emese@gmail.com> Acked-by: David Teigland <teigland@redhat.com> Acked-by: Matt Domsch <Matt_Domsch@dell.com> Acked-by: Maciej Sosnowski <maciej.sosnowski@intel.com> Acked-by: Hans J. Koch <hjk@linutronix.de> Acked-by: Pekka Enberg <penberg@cs.helsinki.fi> Acked-by: Jens Axboe <jens.axboe@oracle.com> Acked-by: Stephen Hemminger <shemminger@vyatta.com> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index eddb1bdd1b8f..b3eeb66c0a51 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -903,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = { .show = show, .store = store, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 83a3d1f4efca..cda932ca3ade 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -388,7 +388,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops threshold_ops = { +static const struct sysfs_ops threshold_ops = { .show = show, .store = store, }; -- cgit v1.2.2 From a07e4156a2ee6359d31a44946d7ee7f85dbf6bca Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Thu, 11 Feb 2010 15:23:05 -0800 Subject: sysfs: Use sysfs_attr_init and sysfs_bin_attr_init on dynamic attributes These are the non-static sysfs attributes that exist on my test machine. Fix them to use sysfs_attr_init or sysfs_bin_attr_init as appropriate. It simply requires making a sysfs attribute present to see this. So this is a little bit tedious but otherwise not too bad. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Acked-by: WANG Cong <xiyou.wangcong@gmail.com> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a8aacd4b513c..28cba46bf32c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2044,6 +2044,7 @@ static __init void mce_init_banks(void) struct mce_bank *b = &mce_banks[i]; struct sysdev_attribute *a = &b->attr; + sysfs_attr_init(&a->attr); a->attr.name = b->attrname; snprintf(b->attrname, ATTR_LEN, "bank%d", i); -- cgit v1.2.2 From 8b408fe4f853dcfa18d133aa4cf1d7546b4c3870 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Mon, 8 Mar 2010 14:20:07 +0100 Subject: x86/amd-iommu: Use helper function to destroy domain In the amd_iommu_domain_destroy the protection_domain_free function is partly reimplemented. The 'partly' is the bug here because the domain is not deleted from the domain list. This results in use-after-free errors and data-corruption. Fix it by just using protection_domain_free instead. Cc: stable@kernel.org Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0c0425436a73..b06f29e275e9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2380,9 +2380,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) free_pagetable(domain); - domain_id_free(domain->id); - - kfree(domain); + protection_domain_free(domain); dom->priv = NULL; } -- cgit v1.2.2 From dc1d628a67a8f042e711ea5accc0beedc3ef0092 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Wed, 3 Mar 2010 15:55:04 +0100 Subject: perf: Provide generic perf_sample_data initialization This makes it easier to extend perf_sample_data and fixes a bug on arm and sparc, which failed to set ->raw to NULL, which can cause crashes when combined with PERF_SAMPLE_RAW. It also optimizes PowerPC and tracepoint, because the struct initialization is forced to zero out the whole structure. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Jean Pihet <jpihet@mvista.com> Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> Acked-by: David S. Miller <davem@davemloft.net> Cc: Jamie Iles <jamie.iles@picochip.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Stephane Eranian <eranian@google.com> Cc: stable@kernel.org LKML-Reference: <20100304140100.315416040@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 3 +-- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 97cddbf32936..42aafd11e170 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1097,8 +1097,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 73102df8bfc1..44b60c852107 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -590,10 +590,9 @@ static void intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; regs.ip = 0; /* @@ -742,8 +741,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 ack, status; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); -- cgit v1.2.2 From 3f6da3905398826d85731247e7fbcf53400c18bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Fri, 5 Mar 2010 13:01:18 +0100 Subject: perf: Rework and fix the arch CPU-hotplug hooks Remove the hw_perf_event_*() hotplug hooks in favour of per PMU hotplug notifiers. This has the advantage of reducing the static weak interface as well as exposing all hotplug actions to the PMU. Use this to fix x86 hotplug usage where we did things in ONLINE which should have been done in UP_PREPARE or STARTING. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Paul Mundt <lethal@linux-sh.org> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo <acme@infradead.org> LKML-Reference: <20100305154128.736225361@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 70 ++++++++++++++++++++-------------- arch/x86/kernel/cpu/perf_event_amd.c | 60 ++++++++++++----------------- arch/x86/kernel/cpu/perf_event_intel.c | 5 ++- 3 files changed, 71 insertions(+), 64 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 42aafd11e170..585d5608ae6b 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -157,6 +157,11 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; + + void (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); }; static struct x86_pmu x86_pmu __read_mostly; @@ -293,7 +298,7 @@ static inline bool bts_available(void) return x86_pmu.enable_bts != NULL; } -static inline void init_debug_store_on_cpu(int cpu) +static void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -305,7 +310,7 @@ static inline void init_debug_store_on_cpu(int cpu) (u32)((u64)(unsigned long)ds >> 32)); } -static inline void fini_debug_store_on_cpu(int cpu) +static void fini_debug_store_on_cpu(int cpu) { if (!per_cpu(cpu_hw_events, cpu).ds) return; @@ -1337,6 +1342,39 @@ undo: #include "perf_event_p6.c" #include "perf_event_intel.c" +static int __cpuinit +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + if (x86_pmu.cpu_prepare) + x86_pmu.cpu_prepare(cpu); + break; + + case CPU_STARTING: + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); + break; + + case CPU_DYING: + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); + break; + + case CPU_DEAD: + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + static void __init pmu_check_apic(void) { if (cpu_has_apic) @@ -1415,6 +1453,8 @@ void __init init_hw_perf_events(void) pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); pr_info("... event mask: %016Lx\n", perf_event_mask); + + perf_cpu_notifier(x86_pmu_notifier); } static inline void x86_pmu_read(struct perf_event *event) @@ -1674,29 +1714,3 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } - -void hw_perf_event_setup_online(int cpu) -{ - init_debug_store_on_cpu(cpu); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - amd_pmu_cpu_online(cpu); - break; - default: - return; - } -} - -void hw_perf_event_setup_offline(int cpu) -{ - init_debug_store_on_cpu(cpu); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - amd_pmu_cpu_offline(cpu); - break; - default: - return; - } -} diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 8f3dbfda3c4f..014528ba7d57 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -271,28 +271,6 @@ done: return &emptyconstraint; } -static __initconst struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints -}; - static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) { struct amd_nb *nb; @@ -378,6 +356,31 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_unlock(&amd_nb_lock); } +static __initconst struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .cpu_prepare = amd_pmu_cpu_online, + .cpu_dead = amd_pmu_cpu_offline, +}; + static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ @@ -390,11 +393,6 @@ static __init int amd_pmu_init(void) memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - /* - * explicitly initialize the boot cpu, other cpus will get - * the cpu hotplug callbacks from smp_init() - */ - amd_pmu_cpu_online(smp_processor_id()); return 0; } @@ -405,12 +403,4 @@ static int amd_pmu_init(void) return 0; } -static void amd_pmu_cpu_online(int cpu) -{ -} - -static void amd_pmu_cpu_offline(int cpu) -{ -} - #endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 44b60c852107..12e811a7d747 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -870,7 +870,10 @@ static __initconst struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, - .get_event_constraints = intel_get_event_constraints + .get_event_constraints = intel_get_event_constraints, + + .cpu_starting = init_debug_store_on_cpu, + .cpu_dying = fini_debug_store_on_cpu, }; static __init int intel_pmu_init(void) -- cgit v1.2.2 From 3fb2b8ddcc6a7aa62af6bd2cb939edfd4c460506 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Mon, 8 Mar 2010 13:51:01 +0100 Subject: perf, x86, Do not user perf_disable from NMI context Explicitly use intel_pmu_{disable,enable}_all() in intel_pmu_handle_irq() to avoid the NMI race conditions in perf_{disable,enable} Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@infradead.org> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event_intel.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 12e811a7d747..c582449163fa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -745,11 +745,11 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); - perf_disable(); + intel_pmu_disable_all(); intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); if (!status) { - perf_enable(); + intel_pmu_enable_all(); return 0; } @@ -759,8 +759,7 @@ again: WARN_ONCE(1, "perfevents: irq loop stuck!\n"); perf_event_print_debug(); intel_pmu_reset(); - perf_enable(); - return 1; + goto done; } inc_irq_stat(apic_perf_irqs); @@ -790,8 +789,8 @@ again: if (status) goto again; - perf_enable(); - +done: + intel_pmu_enable_all(); return 1; } -- cgit v1.2.2 From 07088edb88164c2a2406cd2d9a7be19d8515214b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Tue, 2 Mar 2010 20:16:01 +0100 Subject: perf, x86: Remove superfluous arguments to x86_perf_event_set_period() The second and third argument to x86_perf_event_set_period() are superfluous since they are simple expressions of the first argument. Hence remove them. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo <acme@infradead.org> LKML-Reference: <20100304140100.006500906@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 15 +++++++-------- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 585d5608ae6b..fcf1788f9626 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -170,8 +170,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static int x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx); +static int x86_perf_event_set_period(struct perf_event *event); /* * Generalized hw caching related hw_event table, filled @@ -835,7 +834,7 @@ void hw_perf_enable(void) if (hwc->idx == -1) { x86_assign_hw_event(event, cpuc, i); - x86_perf_event_set_period(event, hwc, hwc->idx); + x86_perf_event_set_period(event); } /* * need to mark as active because x86_pmu_disable() @@ -876,12 +875,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); * To be called with the event disabled in hw: */ static int -x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; - int err, ret = 0; + int err, ret = 0, idx = hwc->idx; if (idx == X86_PMC_IDX_FIXED_BTS) return 0; @@ -979,7 +978,7 @@ static int x86_pmu_start(struct perf_event *event) if (hwc->idx == -1) return -EAGAIN; - x86_perf_event_set_period(event, hwc, hwc->idx); + x86_perf_event_set_period(event); x86_pmu.enable(hwc, hwc->idx); return 0; @@ -1123,7 +1122,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) handled = 1; data.period = event->hw.last_period; - if (!x86_perf_event_set_period(event, hwc, idx)) + if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c582449163fa..6dbdf91ab342 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -699,7 +699,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event) int ret; x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); + ret = x86_perf_event_set_period(event); return ret; } -- cgit v1.2.2 From cc2ad4ba8792b9d4ff893ae3b845d2c5a6206fc9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Tue, 2 Mar 2010 20:18:39 +0100 Subject: perf, x86: Remove superfluous arguments to x86_perf_event_update() The second and third argument to x86_perf_event_update() are superfluous since they are simple expressions of the first argument. Hence remove them. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo <acme@infradead.org> LKML-Reference: <20100304140100.089468871@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 11 ++++++----- arch/x86/kernel/cpu/perf_event_intel.c | 10 ++-------- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fcf1788f9626..086127ba580f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -193,11 +193,12 @@ static u64 __read_mostly hw_cache_event_ids * Returns the delta events processed. */ static u64 -x86_perf_event_update(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_update(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; s64 delta; if (idx == X86_PMC_IDX_FIXED_BTS) @@ -1064,7 +1065,7 @@ static void x86_pmu_stop(struct perf_event *event) * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event, hwc, idx); + x86_perf_event_update(event); cpuc->events[idx] = NULL; } @@ -1112,7 +1113,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; hwc = &event->hw; - val = x86_perf_event_update(event, hwc, idx); + val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; @@ -1458,7 +1459,7 @@ void __init init_hw_perf_events(void) static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event, &event->hw, event->hw.idx); + x86_perf_event_update(event); } static const struct pmu pmu = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6dbdf91ab342..a4c9f160448e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -694,14 +694,8 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) */ static int intel_pmu_save_and_restart(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event); - - return ret; + x86_perf_event_update(event); + return x86_perf_event_set_period(event); } static void intel_pmu_reset(void) -- cgit v1.2.2 From aff3d91a913c9ae0c2f56b65b27cbd00c7d27ee3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Tue, 2 Mar 2010 20:32:08 +0100 Subject: perf, x86: Change x86_pmu.{enable,disable} calling convention Pass the full perf_event into the x86_pmu functions so that those may make use of more than the hw_perf_event, and while doing this, remove the superfluous second argument. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo <acme@infradead.org> LKML-Reference: <20100304140100.165166129@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 31 +++++++++++++++---------------- arch/x86/kernel/cpu/perf_event_intel.c | 30 +++++++++++++++++------------- arch/x86/kernel/cpu/perf_event_p6.c | 10 ++++++---- 3 files changed, 38 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 086127ba580f..2dd704fa1299 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -133,8 +133,8 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_event *, int); - void (*disable)(struct hw_perf_event *, int); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -845,7 +845,7 @@ void hw_perf_enable(void) set_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = event; - x86_pmu.enable(hwc, hwc->idx); + x86_pmu.enable(event); perf_event_update_userpage(event); } cpuc->n_added = 0; @@ -858,15 +858,16 @@ void hw_perf_enable(void) x86_pmu.enable_all(); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) { - (void)checking_wrmsrl(hwc->config_base + idx, + (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); } -static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline void x86_pmu_disable_event(struct perf_event *event) { - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); + struct hw_perf_event *hwc = &event->hw; + (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -927,11 +928,11 @@ x86_perf_event_set_period(struct perf_event *event) return ret; } -static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) - __x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(&event->hw); } /* @@ -974,13 +975,11 @@ static int x86_pmu_enable(struct perf_event *event) static int x86_pmu_start(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx == -1) + if (event->hw.idx == -1) return -EAGAIN; x86_perf_event_set_period(event); - x86_pmu.enable(hwc, hwc->idx); + x86_pmu.enable(event); return 0; } @@ -994,7 +993,7 @@ static void x86_pmu_unthrottle(struct perf_event *event) cpuc->events[hwc->idx] != event)) return; - x86_pmu.enable(hwc, hwc->idx); + x86_pmu.enable(event); } void perf_event_print_debug(void) @@ -1059,7 +1058,7 @@ static void x86_pmu_stop(struct perf_event *event) * could reenable again: */ clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); + x86_pmu.disable(event); /* * Drain the remaining delta count out of a event @@ -1127,7 +1126,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu.disable(hwc, idx); + x86_pmu.disable(event); } if (handled) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a4c9f160448e..a84094897799 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -548,9 +548,9 @@ static inline void intel_pmu_ack_status(u64 ack) } static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = __idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -621,26 +621,28 @@ static void intel_pmu_drain_bts_buffer(void) } static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +intel_pmu_disable_event(struct perf_event *event) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; } if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); + intel_pmu_disable_fixed(hwc); return; } - x86_pmu_disable_event(hwc, idx); + x86_pmu_disable_event(event); } static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = __idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; int err; @@ -670,9 +672,11 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void intel_pmu_enable_event(struct perf_event *event) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { if (!__get_cpu_var(cpu_hw_events).enabled) return; @@ -681,11 +685,11 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) } if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); + intel_pmu_enable_fixed(hwc); return; } - __x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(hwc); } /* @@ -771,7 +775,7 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); + intel_pmu_disable_event(event); } intel_pmu_ack_status(ack); diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index a4e67b99d91c..a330485d14da 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -77,27 +77,29 @@ static void p6_pmu_enable_all(void) } static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +p6_pmu_disable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + idx, val); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); } -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void p6_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + idx, val); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); } static __initconst struct x86_pmu p6_pmu = { -- cgit v1.2.2 From 34538ee77b39a12702e0f4c3ed9e8fa2dd5eb92c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Tue, 2 Mar 2010 21:16:55 +0100 Subject: perf, x86: Use unlocked bitops There is no concurrency on these variables, so don't use LOCK'ed ops. As to the intel_pmu_handle_irq() status bit clean, nobody uses that so remove it all together. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo <acme@infradead.org> LKML-Reference: <20100304140100.240023029@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 8 ++++---- arch/x86/kernel/cpu/perf_event_amd.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2dd704fa1299..01b166737424 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -643,7 +643,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (test_bit(hwc->idx, used_mask)) break; - set_bit(hwc->idx, used_mask); + __set_bit(hwc->idx, used_mask); if (assign) assign[i] = hwc->idx; } @@ -692,7 +692,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; - set_bit(j, used_mask); + __set_bit(j, used_mask); if (assign) assign[i] = j; @@ -842,7 +842,7 @@ void hw_perf_enable(void) * clear active_mask and events[] yet it preserves * idx */ - set_bit(hwc->idx, cpuc->active_mask); + __set_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = event; x86_pmu.enable(event); @@ -1057,7 +1057,7 @@ static void x86_pmu_stop(struct perf_event *event) * Must be done before we disable, otherwise the nmi handler * could reenable again: */ - clear_bit(idx, cpuc->active_mask); + __clear_bit(idx, cpuc->active_mask); x86_pmu.disable(event); /* diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 014528ba7d57..573458f1caf2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -287,7 +287,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) * initialize all possible NB constraints */ for (i = 0; i < x86_pmu.num_events; i++) { - set_bit(i, nb->event_constraints[i].idxmsk); + __set_bit(i, nb->event_constraints[i].idxmsk); nb->event_constraints[i].weight = 1; } return nb; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a84094897799..d87421c3f55b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -765,7 +765,6 @@ again: for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; - clear_bit(bit, (unsigned long *) &status); if (!test_bit(bit, cpuc->active_mask)) continue; -- cgit v1.2.2 From c08053e627d23490a03431285b78b7a5b617fbad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Sat, 6 Mar 2010 13:19:24 +0100 Subject: perf, x86: Fix x86_pmu_start pmu::start should undo pmu::stop, make it so. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@infradead.org> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 01b166737424..9757b96f15f5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -785,6 +785,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } +static int x86_pmu_start(struct perf_event *event); static void x86_pmu_stop(struct perf_event *event); void hw_perf_enable(void) @@ -833,20 +834,10 @@ void hw_perf_enable(void) event = cpuc->event_list[i]; hwc = &event->hw; - if (hwc->idx == -1) { + if (hwc->idx == -1) x86_assign_hw_event(event, cpuc, i); - x86_perf_event_set_period(event); - } - /* - * need to mark as active because x86_pmu_disable() - * clear active_mask and events[] yet it preserves - * idx - */ - __set_bit(hwc->idx, cpuc->active_mask); - cpuc->events[hwc->idx] = event; - x86_pmu.enable(event); - perf_event_update_userpage(event); + x86_pmu_start(event); } cpuc->n_added = 0; perf_events_lapic_init(); @@ -975,11 +966,17 @@ static int x86_pmu_enable(struct perf_event *event) static int x86_pmu_start(struct perf_event *event) { - if (event->hw.idx == -1) + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx = event->hw.idx; + + if (idx == -1) return -EAGAIN; x86_perf_event_set_period(event); + cpuc->events[idx] = event; + __set_bit(idx, cpuc->active_mask); x86_pmu.enable(event); + perf_event_update_userpage(event); return 0; } -- cgit v1.2.2 From 71e2d2828046133ed985696a02e2e1499ca0bfb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Mon, 8 Mar 2010 17:51:33 +0100 Subject: perf, x86: Avoid double disable on throttle vs ioctl(PERF_IOC_DISABLE) Calling ioctl(PERF_EVENT_IOC_DISABLE) on a thottled counter would result in a double disable, cure this by using x86_pmu_{start,stop} for throttle/unthrottle and teach x86_pmu_stop() to check ->active_mask. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@infradead.org> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 20 ++++++-------------- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9757b96f15f5..b68c4fb7a944 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -983,14 +983,8 @@ static int x86_pmu_start(struct perf_event *event) static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->events[hwc->idx] != event)) - return; - - x86_pmu.enable(event); + int ret = x86_pmu_start(event); + WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1050,11 +1044,9 @@ static void x86_pmu_stop(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - __clear_bit(idx, cpuc->active_mask); + if (!__test_and_clear_bit(idx, cpuc->active_mask)) + return; + x86_pmu.disable(event); /* @@ -1123,7 +1115,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu.disable(event); + x86_pmu_stop(event); } if (handled) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d87421c3f55b..84bfde64a337 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -774,7 +774,7 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(event); + x86_pmu_stop(event); } intel_pmu_ack_status(ack); -- cgit v1.2.2 From 356e1f2e0ace2d4b100c8eda9d49b709e8323da5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Sat, 6 Mar 2010 13:49:56 +0100 Subject: perf, x86: Properly account n_added Make sure n_added is properly accounted so that we can rely on the value to reflect the number of added counters. This is needed if its going to be used for more than a boolean check. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@infradead.org> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b68c4fb7a944..071c8405debd 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -959,7 +959,7 @@ static int x86_pmu_enable(struct perf_event *event) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->n_events = n; - cpuc->n_added = n - n0; + cpuc->n_added += n - n0; return 0; } @@ -1302,7 +1302,7 @@ int hw_perf_group_sched_in(struct perf_event *leader, memcpy(cpuc->assign, assign, n0*sizeof(int)); cpuc->n_events = n0; - cpuc->n_added = n1; + cpuc->n_added += n1; ctx->nr_active += n1; /* -- cgit v1.2.2 From 19925ce778f9fc371b9607625de3bff04c60121e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Sat, 6 Mar 2010 13:20:40 +0100 Subject: perf, x86: Fix double disable calls hw_perf_enable() would disable events that were not yet enabled. This causes problems with code that assumes that ->enable/->disable calls are balanced (like the LBR code does). What happens is that we disable newly added counters that match their previous assignment, even though they are not yet programmed on the hardware. Avoid this by only doing the first pass over the existing events. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@infradead.org> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 071c8405debd..045cc0bb4c17 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -802,6 +802,7 @@ void hw_perf_enable(void) return; if (cpuc->n_added) { + int n_running = cpuc->n_events - cpuc->n_added; /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() @@ -809,7 +810,7 @@ void hw_perf_enable(void) * step1: save events moving to new counters * step2: reprogram moved events into new counters */ - for (i = 0; i < cpuc->n_events; i++) { + for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; -- cgit v1.2.2 From f3d46b2e6fa57547f9884330798792afc83f4b04 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Sat, 6 Mar 2010 13:24:58 +0100 Subject: perf, x86: Fix double enable calls hw_perf_enable() would enable already enabled events. This causes problems with code that assumes that ->enable/->disable calls are balanced (like the LBR code does). What happens is that events that were already running and left in place would get enabled again. Avoid this by only enabling new events that match their previous assignment. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@infradead.org> Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 045cc0bb4c17..1d665a0b202c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -835,6 +835,10 @@ void hw_perf_enable(void) event = cpuc->event_list[i]; hwc = &event->hw; + if (i < n_running && + match_prev_assignment(hwc, cpuc, i)) + continue; + if (hwc->idx == -1) x86_assign_hw_event(event, cpuc, i); -- cgit v1.2.2 From 61e67fb9d3ed13e6a7f58652ae4979b9c872fa57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Wed, 3 Mar 2010 07:38:37 +0100 Subject: perf/x86-64: Use frame pointer to walk on irq and process stacks We were using the frame pointer based stack walker on every contexts in x86-32, but not in x86-64 where we only use the seven-league boots on the exception stacks. Use it also on irq and process stacks. This utterly accelerate the captures. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> --- arch/x86/kernel/dumpstack_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d5e2a2ebb627..272c9f1f05f3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -208,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; - bp = print_context_stack(tinfo, stack, bp, + bp = ops->walk_stack(tinfo, stack, bp, ops, data, irq_stack_end, &graph); /* * We link to the next stack (which would be @@ -229,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -- cgit v1.2.2 From 5331d7b84613b8325362dde53dc2bff2fb87d351 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Thu, 4 Mar 2010 21:15:56 +0100 Subject: perf: Introduce new perf_fetch_caller_regs() for hot regs snapshot Events that trigger overflows by interrupting a context can use get_irq_regs() or task_pt_regs() to retrieve the state when the event triggered. But this is not the case for some other class of events like trace events as tracepoints are executed in the same context than the code that triggered the event. It means we need a different api to capture the regs there, namely we need a hot snapshot to get the most important informations for perf: the instruction pointer to get the event origin, the frame pointer for the callchain, the code segment for user_mode() tests (we always use __KERNEL_CS as trace events always occur from the kernel) and the eflags for further purposes. v2: rename perf_save_regs to perf_fetch_caller_regs as per Masami's suggestion. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Masami Hiramatsu <mhiramat@redhat.com> Cc: Jason Baron <jbaron@redhat.com> Cc: Archs <linux-arch@vger.kernel.org> --- arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++++ arch/x86/kernel/dumpstack.h | 15 +++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1d665a0b202c..c6bde7d7afdc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1707,3 +1707,15 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } + +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) +{ + regs->ip = ip; + /* + * perf_arch_fetch_caller_regs adds another call, we need to increment + * the skip level + */ + regs->bp = rewind_frame_pointer(skip + 1); + regs->cs = __KERNEL_CS; + local_save_flags(regs->flags); +} diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 4fd1420faffa..29e5f7c845b2 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,4 +29,19 @@ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; + +static inline unsigned long rewind_frame_pointer(int n) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + while (n--) + frame = frame->next_frame; #endif + + return (unsigned long)frame; +} + +#endif /* DUMPSTACK_H */ -- cgit v1.2.2 From f56e8a0765cc4374e02f4e3a79e2427b5096b075 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Fri, 5 Mar 2010 15:03:27 -0800 Subject: x86/mce: Fix RCU lockdep splats Create an rcu_dereference_check_mce() that checks for RCU-sched read side and mce_read_mutex being held on update side. Replace uses of rcu_dereference() in arch/x86/kernel/cpu/mcheck/mce.c with this new macro. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: x86@kernel.org Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267830207-9474-3-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mcheck/mce.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a8aacd4b513c..4442e9e898c2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,11 @@ #include "mce-internal.h" +#define rcu_dereference_check_mce(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&mce_read_mutex)) + #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -158,7 +163,7 @@ void mce_log(struct mce *mce) mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference(mcelog.next); + entry = rcu_dereference_check_mce(mcelog.next); for (;;) { /* * When the buffer fills up discard new entries. @@ -1500,7 +1505,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, return -ENOMEM; mutex_lock(&mce_read_mutex); - next = rcu_dereference(mcelog.next); + next = rcu_dereference_check_mce(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { @@ -1565,7 +1570,7 @@ timeout: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) + if (rcu_dereference_check_mce(mcelog.next)) return POLLIN | POLLRDNORM; return 0; } -- cgit v1.2.2 From 10fb7f1f2d311b4d2e5d881fe2d83f1c281100f9 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Fri, 5 Mar 2010 13:10:36 -0600 Subject: x86: Reduce per cpu MCA boot up messages Don't write per cpu MCA boot up messages. Signed-of-by: Mike Travis <travis@sgi.com> Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Cc: x86@kernel.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 7c785634af2b..d15df6e49bf0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,7 +95,7 @@ static void cmci_discover(int banks, int boot) /* Already owned by someone else? */ if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) + if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; @@ -107,7 +107,7 @@ static void cmci_discover(int banks, int boot) /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) + if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); } else { -- cgit v1.2.2 From d6dd692168c049196f54edc2e8227c60702bb1d2 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Fri, 5 Mar 2010 13:10:38 -0600 Subject: x86: Reduce per cpu warning boot up messages Reduce warning message output to one line only instead of per cpu. Signed-of-by: Mike Travis <travis@sgi.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Brian Gerst <brgerst@gmail.com> Cc: x86@kernel.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c9b3522b6b46..4e8cb4ee9fcb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -600,7 +600,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk(KERN_WARNING "WARNING: polling idle and HT enabled," + printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," " performance may degrade.\n"); } #endif -- cgit v1.2.2 From 45e16a6834b6af098702e5ea6c9a40de42ff77d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Thu, 11 Mar 2010 13:40:30 +0100 Subject: perf, x86: Fix hw_perf_enable() event assignment What happens is that we schedule badly like: <...>-1987 [019] 280.252808: x86_pmu_start: event-46/1300c0: idx: 0 <...>-1987 [019] 280.252811: x86_pmu_start: event-47/1300c0: idx: 1 <...>-1987 [019] 280.252812: x86_pmu_start: event-48/1300c0: idx: 2 <...>-1987 [019] 280.252813: x86_pmu_start: event-49/1300c0: idx: 3 <...>-1987 [019] 280.252814: x86_pmu_start: event-50/1300c0: idx: 32 <...>-1987 [019] 280.252825: x86_pmu_stop: event-46/1300c0: idx: 0 <...>-1987 [019] 280.252826: x86_pmu_stop: event-47/1300c0: idx: 1 <...>-1987 [019] 280.252827: x86_pmu_stop: event-48/1300c0: idx: 2 <...>-1987 [019] 280.252828: x86_pmu_stop: event-49/1300c0: idx: 3 <...>-1987 [019] 280.252829: x86_pmu_stop: event-50/1300c0: idx: 32 <...>-1987 [019] 280.252834: x86_pmu_start: event-47/1300c0: idx: 1 <...>-1987 [019] 280.252834: x86_pmu_start: event-48/1300c0: idx: 2 <...>-1987 [019] 280.252835: x86_pmu_start: event-49/1300c0: idx: 3 <...>-1987 [019] 280.252836: x86_pmu_start: event-50/1300c0: idx: 32 <...>-1987 [019] 280.252837: x86_pmu_start: event-51/1300c0: idx: 32 *FAIL* This happens because we only iterate the n_running events in the first pass, and reset their index to -1 if they don't match to force a re-assignment. Now, in our RR example, n_running == 0 because we fully unscheduled, so event-50 will retain its idx==32, even though in scheduling it will have gotten idx=0, and we don't trigger the re-assign path. The easiest way to fix this is the below patch, which simply validates the full assignment in the second pass. Reported-by: Stephane Eranian <eranian@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1268311069.5037.31.camel@laptop> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c6bde7d7afdc..5fb490c6ee5c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -811,7 +811,6 @@ void hw_perf_enable(void) * step2: reprogram moved events into new counters */ for (i = 0; i < n_running; i++) { - event = cpuc->event_list[i]; hwc = &event->hw; @@ -826,21 +825,16 @@ void hw_perf_enable(void) continue; x86_pmu_stop(event); - - hwc->idx = -1; } for (i = 0; i < cpuc->n_events; i++) { - event = cpuc->event_list[i]; hwc = &event->hw; - if (i < n_running && - match_prev_assignment(hwc, cpuc, i)) - continue; - - if (hwc->idx == -1) + if (!match_prev_assignment(hwc, cpuc, i)) x86_assign_hw_event(event, cpuc, i); + else if (i < n_running) + continue; x86_pmu_start(event); } -- cgit v1.2.2 From 639fe4b12f92b54c9c3b38c82cdafaa38cfd3e63 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> Date: Thu, 11 Mar 2010 15:30:35 +0800 Subject: perf: export perf_trace_regs and perf_arch_fetch_caller_regs Export perf_trace_regs and perf_arch_fetch_caller_regs since module will use these. Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> [ use EXPORT_PER_CPU_SYMBOL_GPL() ] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <4B989C1B.2090407@cn.fujitsu.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5fb490c6ee5c..7645faea8e85 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1713,3 +1713,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } +EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); -- cgit v1.2.2 From 8447b360a3897bdfb0677107564d1dd9ab6e63be Mon Sep 17 00:00:00 2001 From: Jack Steiner <steiner@sgi.com> Date: Thu, 11 Mar 2010 12:43:29 -0600 Subject: x86, UV: Fix target_cpus() in x2apic_uv_x.c target_cpu() should initially target all cpus, not just cpu 0. Otherwise systems with lots of disks can exhaust the interrupt vectors on cpu 0 if a large number of disks are discovered before the irq balancer is running. Note: UV code only... Signed-off-by: Jack Steiner <steiner@sgi.com> LKML-Reference: <20100311184328.GA21433@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/x2apic_uv_x.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 21db3cbea7dc..af0ca80e38a9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -114,11 +114,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - static const struct cpumask *uv_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) -- cgit v1.2.2 From 5d0e52830e9ae09b872567f4aca3dfb5b5918079 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 10 Mar 2010 15:21:13 -0800 Subject: Add generic sys_old_select() Add a generic implementation of the old select() syscall, which expects its argument in a memory block and switch all architectures over to use it. Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@elte.hu> Reviewed-by: H. Peter Anvin <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: James Morris <jmorris@namei.org> Acked-by: Andreas Schwab <schwab@linux-m68k.org> Acked-by: Russell King <rmk+kernel@arm.linux.org.uk> Acked-by: Greg Ungerer <gerg@uclinux.org> Acked-by: David Howells <dhowells@redhat.com> Cc: Andreas Schwab <schwab@linux-m68k.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/sys_i386_32.c | 17 ----------------- arch/x86/kernel/syscall_table_32.S | 2 +- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index dee1ff7cba58..345dbd19a2b3 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -58,23 +58,6 @@ out: return err; } - -struct sel_arg_struct { - unsigned long n; - fd_set __user *inp, *outp, *exp; - struct timeval __user *tvp; -}; - -asmlinkage int old_select(struct sel_arg_struct __user *arg) -{ - struct sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - /* sys_select() does the appropriate kernel locking */ - return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); -} - /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. * diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 15228b5d3eb7..4d10abacecdb 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -81,7 +81,7 @@ ENTRY(sys_call_table) .long sys_settimeofday .long sys_getgroups16 /* 80 */ .long sys_setgroups16 - .long old_select + .long sys_old_select .long sys_symlink .long sys_lstat .long sys_readlink /* 85 */ -- cgit v1.2.2 From a4679373cf4ee0e7792dc56205365732b725c2c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 10 Mar 2010 15:21:15 -0800 Subject: Add generic sys_old_mmap() Add a generic implementation of the old mmap() syscall, which expects its argument in a memory block and switch all architectures over to use it. Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@elte.hu> Reviewed-by: H. Peter Anvin <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: James Morris <jmorris@namei.org> Cc: Andreas Schwab <schwab@linux-m68k.org> Acked-by: Jesper Nilsson <jesper.nilsson@axis.com> Acked-by: Russell King <rmk+kernel@arm.linux.org.uk> Acked-by: Greg Ungerer <gerg@uclinux.org> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/sys_i386_32.c | 34 ---------------------------------- arch/x86/kernel/syscall_table_32.S | 2 +- 2 files changed, 1 insertion(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 345dbd19a2b3..7955e90c8341 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,40 +24,6 @@ #include <asm/syscalls.h> -/* - * Perform the select(nd, in, out, ex, tv) and mmap() system - * calls. Linux/i386 didn't use to be able to handle more than - * 4 system call parameters, so these system calls used a memory - * block for parameter passing.. - */ - -struct mmap_arg_struct { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; -}; - -asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) -{ - struct mmap_arg_struct a; - int err = -EFAULT; - - if (copy_from_user(&a, arg, sizeof(a))) - goto out; - - err = -EINVAL; - if (a.offset & ~PAGE_MASK) - goto out; - - err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, - a.fd, a.offset >> PAGE_SHIFT); -out: - return err; -} - /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. * diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4d10abacecdb..8b3729341216 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -89,7 +89,7 @@ ENTRY(sys_call_table) .long sys_swapon .long sys_reboot .long sys_old_readdir - .long old_mmap /* 90 */ + .long sys_old_mmap /* 90 */ .long sys_munmap .long sys_truncate .long sys_ftruncate -- cgit v1.2.2 From baed7fc9b580bd3fb8252ff1d9b36eaf1f86b670 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 10 Mar 2010 15:21:18 -0800 Subject: Add generic sys_ipc wrapper Add a generic implementation of the ipc demultiplexer syscall. Except for s390 and sparc64 all implementations of the sys_ipc are nearly identical. There are slight differences in the types of the parameters, where mips and powerpc as the only 64-bit architectures with sys_ipc use unsigned long for the "third" argument as it gets casted to a pointer later, while it traditionally is an "int" like most other paramters. frv goes even further and uses unsigned long for all parameters execept for "ptr" which is a pointer type everywhere. The change from int to unsigned long for "third" and back to "int" for the others on frv should be fine due to the in-register calling conventions for syscalls (we already had a similar issue with the generic sys_ptrace), but I'd prefer to have the arch maintainers looks over this in details. Except for that h8300, m68k and m68knommu lack an impplementation of the semtimedop sub call which this patch adds, and various architectures have gets used - at least on i386 it seems superflous as the compat code on x86-64 and ia64 doesn't even bother to implement it. [akpm@linux-foundation.org: add sys_ipc to sys_ni.c] Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@elte.hu> Reviewed-by: H. Peter Anvin <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: James Morris <jmorris@namei.org> Cc: Andreas Schwab <schwab@linux-m68k.org> Acked-by: Jesper Nilsson <jesper.nilsson@axis.com> Acked-by: Russell King <rmk+kernel@arm.linux.org.uk> Acked-by: David Howells <dhowells@redhat.com> Acked-by: Kyle McMartin <kyle@mcmartin.ca> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/sys_i386_32.c | 85 ------------------------------------------- 1 file changed, 85 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 7955e90c8341..8b5c348fdcf2 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,91 +24,6 @@ #include <asm/syscalls.h> -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc(uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); - - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl(first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd(first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof(tmp))) - return -EFAULT; - return sys_msgrcv(first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv(first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget((key_t) first, second); - case MSGCTL: - return sys_msgctl(first, second, (struct msqid_ds __user *) ptr); - - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat(first, (char __user *) ptr, second, &raddr); - if (ret) - return ret; - return put_user(raddr, (ulong __user *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ - return do_shmat(first, (char __user *) ptr, second, (ulong *) third); - } - case SHMDT: - return sys_shmdt((char __user *)ptr); - case SHMGET: - return sys_shmget(first, second, third); - case SHMCTL: - return sys_shmctl(first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} - /* * Old cruft */ -- cgit v1.2.2 From e28cbf22933d0c0ccaf3c4c27a1a263b41f73859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 10 Mar 2010 15:21:19 -0800 Subject: improve sys_newuname() for compat architectures On an architecture that supports 32-bit compat we need to override the reported machine in uname with the 32-bit value. Instead of doing this separately in every architecture introduce a COMPAT_UTS_MACHINE define in <asm/compat.h> and apply it directly in sys_newuname(). Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@elte.hu> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: James Morris <jmorris@namei.org> Cc: Andreas Schwab <schwab@linux-m68k.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/sys_x86_64.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8aa2057efd12..ff14a5044ce6 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -209,15 +209,3 @@ bottomup: return addr; } - - -SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) -{ - int err; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - return err ? -EFAULT : 0; -} -- cgit v1.2.2 From 5cacdb4add1b1e50fe75edc50ebbb7bddd9cf5e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 10 Mar 2010 15:21:21 -0800 Subject: Add generic sys_olduname() Add generic implementations of the old and really old uname system calls. Note that sh only implements sys_olduname but not sys_oldolduname, but I'm not going to bother with another ifdef for that special case. m32r implemented an old uname but never wired it up, so kill it, too. Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@elte.hu> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: James Morris <jmorris@namei.org> Cc: Andreas Schwab <schwab@linux-m68k.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/sys_i386_32.c | 49 ------------------------------------------- 1 file changed, 49 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 8b5c348fdcf2..196552bb412c 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,55 +24,6 @@ #include <asm/syscalls.h> -/* - * Old cruft - */ -asmlinkage int sys_uname(struct old_utsname __user *name) -{ - int err; - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - return err? -EFAULT:0; -} - -asmlinkage int sys_olduname(struct oldold_utsname __user *name) -{ - int error; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); - - up_read(&uts_sem); - - error = error ? -EFAULT : 0; - - return error; -} - - /* * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. -- cgit v1.2.2 From 0e152cd7c16832bd5cadee0c2e41d9959bc9b6f9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov <bp@amd64.org> Date: Fri, 12 Mar 2010 15:43:03 +0100 Subject: x86, k8 nb: Fix boot crash: enable k8_northbridges unconditionally on AMD systems de957628ce7c84764ff41331111036b3ae5bad0f changed setting of the x86_init.iommu.iommu_init function ptr only when GART IOMMU is found. One side effect of it is that num_k8_northbridges is not initialized anymore if not explicitly called. This resulted in uninitialized pointers in <arch/x86/kernel/cpu/intel_cacheinfo.c:amd_calc_l3_indices()>, for example, which uses the num_k8_northbridges thing through node_to_k8_nb_misc(). Fix that through an initcall that runs right after the PCI subsystem and does all the scanning. Then, remove initialization in gart_iommu_init() which is a rootfs_initcall and we're running before that. What is more, since num_k8_northbridges is being used in other places beside GART IOMMU, include it whenever we add AMD CPU support. The previous dependency chain in kconfig contained K8_NB depends on AGP_AMD64|GART_IOMMU which was clearly incorrect. The more natural way in terms of hardware dependency should be AGP_AMD64|GART_IOMMU depends on K8_NB depends on CPU_SUP_AMD && PCI. Make it so Number One! Signed-off-by: Borislav Petkov <borislav.petkov@amd.com> Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Cc: Joerg Roedel <joerg.roedel@amd.com> LKML-Reference: <20100312144303.GA29262@aftab> Signed-off-by: Ingo Molnar <mingo@elte.hu> Tested-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/k8.c | 14 ++++++++++++++ arch/x86/kernel/pci-gart_64.c | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index cbc4332a77b2..9b895464dd03 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -121,3 +121,17 @@ void k8_flush_garts(void) } EXPORT_SYMBOL_GPL(k8_flush_garts); +static __init int init_k8_nbs(void) +{ + int err = 0; + + err = cache_k8_northbridges(); + + if (err < 0) + printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 34de53b46f87..f3af115a573a 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -735,7 +735,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) + if (num_k8_northbridges == 0) return 0; #ifndef CONFIG_AGP_AMD64 -- cgit v1.2.2 From 2aa2b50dd62b5d0675bd7453fbeb5732dc2d7866 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Sun, 14 Mar 2010 08:57:03 +0100 Subject: x86/mce: Fix build bug with CONFIG_PROVE_LOCKING=y && CONFIG_X86_MCE_INTEL=y Commit f56e8a076 "x86/mce: Fix RCU lockdep splats" introduced the following build bug: arch/x86/kernel/cpu/mcheck/mce.c: In function 'mce_log': arch/x86/kernel/cpu/mcheck/mce.c:166: error: 'mce_read_mutex' undeclared (first use in this function) arch/x86/kernel/cpu/mcheck/mce.c:166: error: (Each undeclared identifier is reported only once arch/x86/kernel/cpu/mcheck/mce.c:166: error: for each function it appears in.) Move the in-the-middle-of-file lock variable up to the variable definition section, the top of the .c file. Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: x86@kernel.org Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267830207-9474-3-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index bd58de4d7a29..3ab9c886b613 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,8 @@ #include "mce-internal.h" +static DEFINE_MUTEX(mce_read_mutex); + #define rcu_dereference_check_mce(p) \ rcu_dereference_check((p), \ rcu_read_lock_sched_held() || \ @@ -1490,8 +1492,6 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } -static DEFINE_MUTEX(mce_read_mutex); - static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { -- cgit v1.2.2 From 8144c880397d502d12af4ef721f3eac50163fa39 Mon Sep 17 00:00:00 2001 From: Len Brown <len.brown@intel.com> Date: Thu, 18 Feb 2010 23:42:47 -0500 Subject: ACPI: remove "acpi=ht" DMI blacklist SuSE added these entries when deploying ACPI in Linux-2.4. I pulled them into Linux-2.6 on 2003-08-09. Over the last 6+ years, several entries have proven to be unnecessary and deleted, while no new entries have been added. Matthew suggests that they now have negative value, and I agree. Based-on-patch-by: Matthew Garrett <mjg59@srcf.ucam.org> Signed-off-by: Len Brown <len.brown@intel.com> --- arch/x86/kernel/acpi/boot.c | 93 --------------------------------------------- 1 file changed, 93 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a54d714545ff..df586bbc9447 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1292,23 +1292,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) return 0; } -/* - * Limit ACPI to CPU enumeration for HT - */ -static int __init force_acpi_ht(const struct dmi_system_id *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", - d->ident); - disable_acpi(); - acpi_ht = 1; - } else { - printk(KERN_NOTICE - "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); - } - return 0; -} - /* * Force ignoring BIOS IRQ0 pin2 override */ @@ -1344,82 +1327,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, }, - /* - * Boxes that need acpi=ht - */ - { - .callback = force_acpi_ht, - .ident = "FSC Primergy T850", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), - DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "HP VISUALIZE NT Workstation", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "Compaq Workstation W8000", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), - DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ASUS CUR-DLS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ABIT i440BX-W83977", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), - DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM Bladecenter", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eServer xSeries 360", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eserver xSeries 330", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eserver xSeries 440", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), - }, - }, - /* * Boxes that need ACPI PCI IRQ routing disabled */ -- cgit v1.2.2 From 4c81ba4900ab4eb24c7d2ba1aca594c644b6ce4c Mon Sep 17 00:00:00 2001 From: Len Brown <len.brown@intel.com> Date: Sun, 14 Mar 2010 16:28:46 -0400 Subject: ACPI: plan to delete "acpi=ht" boot option Signed-off-by: Len Brown <len.brown@intel.com> --- arch/x86/kernel/acpi/boot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index df586bbc9447..7914ab0ad76e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1559,8 +1559,10 @@ static int __init parse_acpi(char *arg) } /* Limit ACPI just to boot-time to enable HT */ else if (strcmp(arg, "ht") == 0) { - if (!acpi_force) + if (!acpi_force) { + printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n"); disable_acpi(); + } acpi_ht = 1; } /* acpi=rsdt use RSDT instead of XSDT */ -- cgit v1.2.2 From d8191fa4a33fdc817277da4f2b7f771ff605a41c Mon Sep 17 00:00:00 2001 From: Alex Chiang <achiang@hp.com> Date: Mon, 22 Feb 2010 12:11:39 -0700 Subject: ACPI: processor: driver doesn't need to evaluate _PDC Now that the early _PDC evaluation path knows how to correctly evaluate _PDC on only physically present processors, there's no need for the processor driver to evaluate it later when it loads. To cover the hotplug case, push _PDC evaluation down into the hotplug paths. Cc: x86@kernel.org Cc: Tony Luck <tony.luck@intel.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Alex Chiang <achiang@hp.com> Signed-off-by: Len Brown <len.brown@intel.com> --- arch/x86/kernel/acpi/boot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a54d714545ff..d635a93ae59c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -490,6 +490,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) * ACPI based hotplug support for CPU */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +#include <acpi/processor.h> static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { @@ -567,6 +568,8 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) goto free_new_map; } + acpi_processor_set_pdc(handle); + cpu = cpumask_first(new_map); acpi_map_cpu2node(handle, cpu, physid); -- cgit v1.2.2 From 36e9e1eab777e077f7484d309ff676d0568e27d1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Mon, 15 Mar 2010 14:33:06 -0800 Subject: x86: Handle legacy PIC interrupts on all the cpu's Ingo Molnar reported that with the recent changes of not statically blocking IRQ0_VECTOR..IRQ15_VECTOR's on all the cpu's, broke an AMD platform (with Nvidia chipset) boot when "noapic" boot option is used. On this platform, legacy PIC interrupts are getting delivered to all the cpu's instead of just the boot cpu. Thus not initializing the vector to irq mapping for the legacy irq's resulted in not handling certain interrupts causing boot hang. Fix this by initializing the vector to irq mapping on all the logical cpu's, if the legacy IRQ is handled by the legacy PIC. Reported-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> [ -v2: io-apic-enabled improvement ] Acked-by: Yinghai Lu <yinghai@kernel.org> Cc: Eric W. Biederman <ebiederm@xmission.com> LKML-Reference: <1268692386.3296.43.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/io_apic.c | 8 ++++++++ arch/x86/kernel/irqinit.c | 22 ++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e4e0ddcb1546..463de9a858ad 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1268,6 +1268,14 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; + + /* + * If it is a legacy IRQ handled by the legacy PIC, this cpu + * will be part of the irq_cfg's domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) + cpumask_set_cpu(cpu, cfg->domain); + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index ef257fc2921b..f01d390f9c5b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -141,6 +141,28 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } +/* + * Setup the vector to irq mappings. + */ +void setup_vector_irq(int cpu) +{ +#ifndef CONFIG_X86_IO_APIC + int irq; + + /* + * On most of the platforms, legacy PIC delivers the interrupts on the + * boot cpu. But there are certain platforms where PIC interrupts are + * delivered to multiple cpu's. If the legacy IRQ is handled by the + * legacy PIC, for the new cpu that is coming online, setup the static + * legacy vector to irq mapping: + */ + for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) + per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; +#endif + + __setup_vector_irq(cpu); +} + static void __init smp_intr_init(void) { #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a02e80c3c54b..06d98ae5a802 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -247,7 +247,7 @@ static void __cpuinit smp_callin(void) /* * Need to setup vector mappings before we enable interrupts. */ - __setup_vector_irq(smp_processor_id()); + setup_vector_irq(smp_processor_id()); /* * Get our bogomips. * -- cgit v1.2.2 From dcd5c1662db59a6b82942f47fb6ac9dd63f6d3dd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Tue, 16 Mar 2010 01:05:02 +0100 Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs perf_arch_fetch_caller_regs() is exported for the overriden x86 version, but not for the generic weak version. As a general rule, weak functions should not have their symbol exported in the same file they are defined. So let's export it on trace_event_perf.c as it is used by trace events only. This fixes: ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined! ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined! -v2: And also only build it if trace events are enabled. -v3: Fix changelog mistake Reported-by: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> Cc: Paul Mackerras <paulus@samba.org> LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 7645faea8e85..60398a0d947c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1702,6 +1702,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } +#ifdef CONFIG_EVENT_TRACING void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { regs->ip = ip; @@ -1713,4 +1714,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); +#endif -- cgit v1.2.2 From 035a02c1e1de31888e8b6adac0ff667971ac04db Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 19 Mar 2010 12:09:22 +0100 Subject: x86, amd: Restrict usage of c1e_idle() Currently c1e_idle returns true for all CPUs greater than or equal to family 0xf model 0x40. This covers too many CPUs. Meanwhile a respective erratum for the underlying problem was filed (#400). This patch adds the logic to check whether erratum #400 applies to a given CPU. Especially for CPUs where SMI/HW triggered C1e is not supported, c1e_idle() doesn't need to be used. We can check this by looking at the respective OSVW bit for erratum #400. Cc: <stable@kernel.org> # .32.x .33.x Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> LKML-Reference: <20100319110922.GA19614@alberich.amd.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/process.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ad9540676fcc..28ad9f4d8b94 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -526,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) } /* - * Check for AMD CPUs, which have potentially C1E support + * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. + * For more information see + * - Erratum #400 for NPT family 0xf and family 0x10 CPUs + * - Erratum #365 for family 0x11 (not affected because C1e not in use) */ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) { + u64 val; if (c->x86_vendor != X86_VENDOR_AMD) - return 0; - - if (c->x86 < 0x0F) - return 0; + goto no_c1e_idle; /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; + if (c->x86 == 0x0F && c->x86_model >= 0x40) + return 1; - return 1; + if (c->x86 == 0x10) { + /* + * check OSVW bit for CPUs that are not affected + * by erratum #400 + */ + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); + if (val >= 2) { + rdmsrl(MSR_AMD64_OSVW_STATUS, val); + if (!(val & BIT(1))) + goto no_c1e_idle; + } + return 1; + } + +no_c1e_idle: + return 0; } static cpumask_var_t c1e_mask; -- cgit v1.2.2 From a90110c61073eab95d1986322693c2b9a8a6a5f6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" <rjw@sisk.pl> Date: Sun, 21 Mar 2010 21:51:51 +0100 Subject: x86 / perf: Fix suspend to RAM on HP nx6325 Commit 3f6da3905398826d85731247e7fbcf53400c18bd (perf: Rework and fix the arch CPU-hotplug hooks) broke suspend to RAM on my HP nx6325 (and most likely on other AMD-based boxes too) by allowing amd_pmu_cpu_offline() to be executed for CPUs that are going offline as part of the suspend process. The problem is that cpuhw->amd_nb may be NULL already, so the function should make sure it's not NULL before accessing the object pointed to by it. Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/cpu/perf_event_amd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 573458f1caf2..b87e0b6970cb 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -348,10 +348,12 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_lock(&amd_nb_lock); - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); + if (cpuhw->amd_nb) { + if (--cpuhw->amd_nb->refcnt == 0) + kfree(cpuhw->amd_nb); - cpuhw->amd_nb = NULL; + cpuhw->amd_nb = NULL; + } raw_spin_unlock(&amd_nb_lock); } -- cgit v1.2.2 From 596b711ed6b5235f8545680ef38ace00f9898c32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Sun, 28 Mar 2010 19:42:54 -0700 Subject: x86: Make smp_locks end with page alignment Fix: ------------[ cut here ]------------ WARNING: at arch/x86/mm/init.c:342 free_init_pages+0x4c/0xfa() free_init_pages: range [0x40daf000, 0x40db5c24] is not aligned Modules linked in: Pid: 0, comm: swapper Not tainted 2.6.34-rc2-tip-03946-g4f16b23-dirty #50 Call Trace: [<40232e9f>] warn_slowpath_common+0x65/0x7c [<4021c9f0>] ? free_init_pages+0x4c/0xfa [<40881434>] ? _etext+0x0/0x24 [<40232eea>] warn_slowpath_fmt+0x24/0x27 [<4021c9f0>] free_init_pages+0x4c/0xfa [<40881434>] ? _etext+0x0/0x24 [<40d3f4bd>] alternative_instructions+0xf6/0x100 [<40d3fe4f>] check_bugs+0xbd/0xbf [<40d398a7>] start_kernel+0x2d5/0x2e4 [<40d390ce>] i386_start_kernel+0xce/0xd5 ---[ end trace 4eaa2a86a8e2da22 ]--- Comments in vmlinux.lds.S already said: | /* | * smp_locks might be freed after init | * start/end must be page aligned | */ Signed-off-by: Yinghai Lu <yinghai@kernel.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: David Miller <davem@davemloft.net> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> LKML-Reference: <1269830604-26214-2-git-send-email-yinghai@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 44879df55696..2cc249718c46 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,8 +291,8 @@ SECTIONS .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) - __smp_locks_end = .; . = ALIGN(PAGE_SIZE); + __smp_locks_end = .; } #ifdef CONFIG_X86_64 -- cgit v1.2.2 From c967da6a0ba837f762042e931d4afcf72045547c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Sun, 28 Mar 2010 19:42:55 -0700 Subject: x86: Make sure free_init_pages() frees pages on page boundary When CONFIG_NO_BOOTMEM=y, it could use memory more effiently, or in a more compact fashion. Example: Allocated new RAMDISK: 00ec2000 - 0248ce57 Move RAMDISK from 000000002ea04000 - 000000002ffcee56 to 00ec2000 - 0248ce56 The new RAMDISK's end is not page aligned. Last page could be shared with other users. When free_init_pages are called for initrd or .init, the page could be freed and we could corrupt other data. code segment in free_init_pages(): | for (; addr < end; addr += PAGE_SIZE) { | ClearPageReserved(virt_to_page(addr)); | init_page_count(virt_to_page(addr)); | memset((void *)(addr & ~(PAGE_SIZE-1)), | POISON_FREE_INITMEM, PAGE_SIZE); | free_page(addr); | totalram_pages++; | } last half page could be used as one whole free page. So page align the boundaries. -v2: make the original initramdisk to be aligned, according to Johannes, otherwise we have the chance to lose one page. we still need to keep initrd_end not aligned, otherwise it could confuse decompressor. -v3: change to WARN_ON instead, suggested by Johannes. -v4: use PAGE_ALIGN, suggested by Johannes. We may fix that macro name later to PAGE_ALIGN_UP, and PAGE_ALIGN_DOWN Add comments about assuming ramdisk start is aligned in relocate_initrd(), change to re get ramdisk_image instead of save it to make diff smaller. Add warning for wrong range, suggested by Johannes. -v6: remove one WARN() We need to align beginning in free_init_pages() do not copy more than ramdisk_size, noticed by Johannes Reported-by: Stanislaw Gruszka <sgruszka@redhat.com> Tested-by: Stanislaw Gruszka <sgruszka@redhat.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: David Miller <davem@davemloft.net> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> LKML-Reference: <1269830604-26214-3-git-send-email-yinghai@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head32.c | 4 +++- arch/x86/kernel/head64.c | 3 ++- arch/x86/kernel/setup.c | 10 ++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index adedeef1dedc..b2e246037392 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/start_kernel.h> +#include <linux/mm.h> #include <asm/setup.h> #include <asm/sections.h> @@ -44,9 +45,10 @@ void __init i386_start_kernel(void) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b5a9896ca1e7..7147143fd614 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d7ba1a449bd..d76e18570c60 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -314,16 +314,17 @@ static void __init reserve_brk(void) #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { - + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 area_size = PAGE_ALIGN(ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, + ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, PAGE_SIZE); if (ramdisk_here == -1ULL) @@ -332,7 +333,7 @@ static void __init relocate_initrd(void) /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, + reserve_early(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -376,9 +377,10 @@ static void __init relocate_initrd(void) static void __init reserve_initrd(void) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || -- cgit v1.2.2 From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 24 Mar 2010 17:04:11 +0900 Subject: include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> --- arch/x86/kernel/acpi/boot.c | 1 + arch/x86/kernel/alternative.c | 1 + arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- arch/x86/kernel/apb_timer.c | 1 + arch/x86/kernel/apic/es7000_32.c | 1 + arch/x86/kernel/apic/io_apic.c | 1 + arch/x86/kernel/apic/nmi.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + arch/x86/kernel/bootflag.c | 1 - arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 1 + arch/x86/kernel/cpu/cpufreq/elanfreq.c | 1 - arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 1 + arch/x86/kernel/cpu/cpufreq/longrun.c | 1 - arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 - arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 1 + arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 1 - arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 1 + arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 1 - arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 1 - arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 1 - arch/x86/kernel/cpu/mcheck/mce-inject.c | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 1 + arch/x86/kernel/cpu/mcheck/mce_amd.c | 1 + arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + arch/x86/kernel/cpu/mtrr/generic.c | 1 - arch/x86/kernel/cpu/mtrr/if.c | 1 + arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpuid.c | 1 + arch/x86/kernel/crash_dump_32.c | 1 + arch/x86/kernel/hpet.c | 1 + arch/x86/kernel/i387.c | 1 + arch/x86/kernel/i8259.c | 1 - arch/x86/kernel/irqinit.c | 1 - arch/x86/kernel/k8.c | 2 +- arch/x86/kernel/kdebugfs.c | 1 + arch/x86/kernel/ldt.c | 1 + arch/x86/kernel/machine_kexec_64.c | 1 + arch/x86/kernel/mca_32.c | 1 + arch/x86/kernel/module.c | 1 + arch/x86/kernel/msr.c | 1 + arch/x86/kernel/pci-dma.c | 1 + arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/pci-nommu.c | 1 + arch/x86/kernel/ptrace.c | 1 + arch/x86/kernel/setup.c | 1 - arch/x86/kernel/smp.c | 1 + arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/tlb_uv.c | 1 + arch/x86/kernel/uv_irq.c | 1 + arch/x86/kernel/uv_time.c | 1 + arch/x86/kernel/vmi_32.c | 1 + 52 files changed, 40 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0061ea263061..cd40aba6aa95 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/irq.h> +#include <linux/slab.h> #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/pci.h> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3a4bf35c179b..1a160d5d44d0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -8,6 +8,7 @@ #include <linux/vmalloc.h> #include <linux/memory.h> #include <linux/stop_machine.h> +#include <linux/slab.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index adb0ba025702..f3dadb571d9b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -18,8 +18,8 @@ */ #include <linux/pci.h> -#include <linux/gfp.h> #include <linux/bitmap.h> +#include <linux/slab.h> #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/dma-mapping.h> diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9dc91b431470..42f5350b908f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -19,8 +19,8 @@ #include <linux/pci.h> #include <linux/acpi.h> -#include <linux/gfp.h> #include <linux/list.h> +#include <linux/slab.h> #include <linux/sysdev.h> #include <linux/interrupt.h> #include <linux/msi.h> diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 4b7099526d2c..ff469e470059 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -33,6 +33,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/sysdev.h> +#include <linux/slab.h> #include <linux/pm.h> #include <linux/pci.h> #include <linux/sfi.h> diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index dd2b5f264643..03ba1b895f5e 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -42,6 +42,7 @@ #include <linux/errno.h> #include <linux/acpi.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/nmi.h> #include <linux/smp.h> #include <linux/io.h> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 463de9a858ad..127b8718abfb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -36,6 +36,7 @@ #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/jiffies.h> /* time_after() */ +#include <linux/slab.h> #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 8aa65adbd25d..1edaf15c0b8e 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -18,6 +18,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/sysdev.h> #include <linux/sysctl.h> #include <linux/percpu.h> diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 49dbeaef2a27..c085d52dbaf2 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -17,6 +17,7 @@ #include <linux/ctype.h> #include <linux/sched.h> #include <linux/timer.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/init.h> #include <linux/io.h> diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 30f25a75fe28..5de7f4c56971 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -5,7 +5,6 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/acpi.h> #include <asm/io.h> diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1b1920fa7c80..459168083b77 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <trace/events/power.h> #include <linux/acpi.h> diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 006b278b0d5d..c587db472a75 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -20,7 +20,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/cpufreq.h> diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index ac27ec2264d5..16e3483be9e3 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -80,6 +80,7 @@ #include <linux/cpufreq.h> #include <linux/pci.h> #include <linux/errno.h> +#include <linux/slab.h> #include <asm/processor-cyrix.h> diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index da5f70fcb766..e7b559d74c52 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/cpufreq.h> #include <linux/timex.h> diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 869615193720..7b8a8ba67b07 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -25,7 +25,6 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/cpumask.h> #include <linux/timex.h> diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ff36d2979a90..ce7cde713e71 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/cpufreq.h> #include <linux/compiler.h> +#include <linux/slab.h> #include <linux/acpi.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index cb01dac267d3..b3379d6a5c57 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -13,7 +13,6 @@ #include <linux/init.h> #include <linux/cpufreq.h> #include <linux/ioport.h> -#include <linux/slab.h> #include <linux/timex.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 8d672ef162ce..9b1ff37de46a 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -20,6 +20,7 @@ #include <linux/sched.h> /* current */ #include <linux/delay.h> #include <linux/compiler.h> +#include <linux/gfp.h> #include <asm/msr.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 2ce8e0b5cc54..561758e95180 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -23,7 +23,6 @@ #include <linux/init.h> #include <linux/cpufreq.h> #include <linux/pci.h> -#include <linux/slab.h> #include <linux/sched.h> #include "speedstep-lib.h" diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index ad0083abfa23..a94ec6be69fa 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -13,7 +13,6 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <asm/msr.h> #include <asm/tsc.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index 04d73c114e49..8abd869baabf 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -17,7 +17,6 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/io.h> #include <asm/ist.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 73734baa50f2..e7dbde7bfedb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -22,6 +22,7 @@ #include <linux/kdebug.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/gfp.h> #include <asm/mce.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3ab9c886b613..8a6f0afa767e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -26,6 +26,7 @@ #include <linux/sched.h> #include <linux/sysfs.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/poll.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index cda932ca3ade..224392d8fe8c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -21,6 +21,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/sysfs.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index d15df6e49bf0..62b48e40920a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -5,6 +5,7 @@ * Author: Andi Kleen */ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9aa5dc76ff4a..fd31a441c61c 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -6,7 +6,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/io.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index e006e56f699c..79289632cb27 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/string.h> +#include <linux/slab.h> #include <linux/init.h> #define LINE_SIZE 80 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 60398a0d947c..0316ffe851bd 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -21,6 +21,7 @@ #include <linux/kdebug.h> #include <linux/sched.h> #include <linux/uaccess.h> +#include <linux/slab.h> #include <linux/highmem.h> #include <linux/cpu.h> #include <linux/bitops.h> diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 83e5e628de73..8b862d5900fe 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,6 +40,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/uaccess.h> +#include <linux/gfp.h> #include <asm/processor.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index cd97ce18c29d..67414550c3cc 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -5,6 +5,7 @@ * Copyright (C) IBM Corporation, 2004. All rights reserved */ +#include <linux/slab.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/crash_dump.h> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ee4fa1bfcb33..d10a7e7294f4 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -4,6 +4,7 @@ #include <linux/sysdev.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/slab.h> #include <linux/hpet.h> #include <linux/init.h> #include <linux/cpu.h> diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index c01a2b846d47..54c31c285488 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/regset.h> #include <linux/sched.h> +#include <linux/slab.h> #include <asm/sigcontext.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index fb725ee15f55..7c9f02c130f3 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -5,7 +5,6 @@ #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/timex.h> -#include <linux/slab.h> #include <linux/random.h> #include <linux/init.h> #include <linux/kernel_stat.h> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f01d390f9c5b..0ed2d300cd46 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -5,7 +5,6 @@ #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/timex.h> -#include <linux/slab.h> #include <linux/random.h> #include <linux/kprobes.h> #include <linux/init.h> diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index 9b895464dd03..0f7bc20cfcde 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -2,8 +2,8 @@ * Shared support code for AMD K8 northbridges and derivates. * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. */ -#include <linux/gfp.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/module.h> diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index e444357375ce..8afd9f321f10 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/stat.h> #include <linux/io.h> diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ec6ef60cbd17..ea697263b373 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -7,6 +7,7 @@ */ #include <linux/errno.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 4a8bb82248ae..035c8c529181 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> +#include <linux/gfp.h> #include <linux/reboot.h> #include <linux/numa.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 845d80ce1ef1..63eaf6596233 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -42,6 +42,7 @@ #include <linux/kernel.h> #include <linux/mca.h> #include <linux/kprobes.h> +#include <linux/slab.h> #include <asm/system.h> #include <asm/io.h> #include <linux/proc_fs.h> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 89f386f044e4..e0bc186d7501 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -23,6 +23,7 @@ #include <linux/kernel.h> #include <linux/bug.h> #include <linux/mm.h> +#include <linux/gfp.h> #include <asm/system.h> #include <asm/page.h> diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 206735ac8cbd..4d4468e9f47c 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -37,6 +37,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/uaccess.h> +#include <linux/gfp.h> #include <asm/processor.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a4ac764a6880..4b7e3d8b01dd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -2,6 +2,7 @@ #include <linux/dma-debug.h> #include <linux/dmar.h> #include <linux/bootmem.h> +#include <linux/gfp.h> #include <linux/pci.h> #include <linux/kmemleak.h> diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index f3af115a573a..68cd24f9deae 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -29,6 +29,7 @@ #include <linux/iommu-helper.h> #include <linux/sysdev.h> #include <linux/io.h> +#include <linux/gfp.h> #include <asm/atomic.h> #include <asm/mtrr.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 22be12b60a8f..3af4af810c07 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -4,6 +4,7 @@ #include <linux/scatterlist.h> #include <linux/string.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/pci.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a503b1fd04e5..2e9b55027b7e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> +#include <linux/slab.h> #include <linux/ptrace.h> #include <linux/regset.h> #include <linux/tracehook.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d7ba1a449bd..c08d1e3261a8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -55,7 +55,6 @@ #include <linux/stddef.h> #include <linux/unistd.h> #include <linux/ptrace.h> -#include <linux/slab.h> #include <linux/user.h> #include <linux/delay.h> diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index ec1de97600e7..d801210945d6 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -21,6 +21,7 @@ #include <linux/cache.h> #include <linux/interrupt.h> #include <linux/cpu.h> +#include <linux/gfp.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 06d98ae5a802..be40f82b09af 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -49,6 +49,7 @@ #include <linux/nmi.h> #include <linux/tboot.h> #include <linux/stackprotector.h> +#include <linux/gfp.h> #include <asm/acpi.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 364d015efebc..17b03dd3a6b5 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <asm/mmu_context.h> #include <asm/uv/uv.h> diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index ece73d8e3240..1d40336b030a 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/rbtree.h> +#include <linux/slab.h> #include <linux/irq.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 2b75ef638dbc..56e421bc379b 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -19,6 +19,7 @@ * Copyright (c) Dimitri Sivanich */ #include <linux/clockchips.h> +#include <linux/slab.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 7dd599deca4a..ce9fbacb7526 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -28,6 +28,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/gfp.h> #include <asm/vmi.h> #include <asm/io.h> #include <asm/fixmap.h> -- cgit v1.2.2 From 9f3a5f52aa63d3aa4c64a7245153549bb66bad8c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Mon, 29 Mar 2010 22:38:29 -0700 Subject: x86: Make e820_remove_range to handle all covered case Rusty found on lguest with trim_bios_range, max_pfn is not right anymore, and looks e820_remove_range does not work right. [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] LGUEST: 0000000000000000 - 0000000004000000 (usable) [ 0.000000] Notice: NX (Execute Disable) protection missing in CPU or disabled in BIOS! [ 0.000000] DMI not present or invalid. [ 0.000000] last_pfn = 0x3fa0 max_arch_pfn = 0x100000 [ 0.000000] init_memory_mapping: 0000000000000000-0000000003fa0000 root cause is: the e820_remove_range doesn't handle the all covered case. e820_remove_range(BIOS_START, BIOS_END - BIOS_START, ...) produces a bogus range as a result. Make it match e820_update_range() by handling that case too. Reported-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Tested-by: Rusty Russell <rusty@rustcorp.com.au> LKML-Reference: <4BB18E55.6090903@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/e820.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 740b440fbd73..7bca3c6a02fb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -519,29 +519,45 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", (unsigned long long) start, (unsigned long long) end); - e820_print_type(old_type); + if (checktype) + e820_print_type(old_type); printk(KERN_CONT "\n"); for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; + u64 ei_end; if (checktype && ei->type != old_type) continue; + + ei_end = ei->addr + ei->size; /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { + if (ei->addr >= start && ei_end <= end) { real_removed_size += ei->size; memset(ei, 0, sizeof(struct e820entry)); continue; } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + e820_add_region(end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_removed_size += size; + continue; + } + /* partially covered */ final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); + final_end = min(end, ei_end); if (final_start >= final_end) continue; real_removed_size += final_end - final_start; + /* + * left range could be head or tail, so need to update + * size at first. + */ ei->size -= final_end - final_start; if (ei->addr < final_start) continue; -- cgit v1.2.2 From e49a5bd38159dfb1928fd25b173bc9de4bbadb21 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Mon, 22 Mar 2010 19:40:03 +0100 Subject: perf: Use hot regs with software sched switch/migrate events Scheduler's task migration events don't work because they always pass NULL regs perf_sw_event(). The event hence gets filtered in perf_swevent_add(). Scheduler's context switches events use task_pt_regs() to get the context when the event occured which is a wrong thing to do as this won't give us the place in the kernel where we went to sleep but the place where we left userspace. The result is even more wrong if we switch from a kernel thread. Use the hot regs snapshot for both events as they belong to the non-interrupt/exception based events family. Unlike page faults or so that provide the regs matching the exact origin of the event, we need to save the current context. This makes the task migration event working and fix the context switch callchains and origin ip. Example: perf record -a -e cs Before: 10.91% ksoftirqd/0 0 [k] 0000000000000000 | --- (nil) perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_swevent_ctx_event do_perf_sw_event __perf_sw_event perf_event_task_sched_out schedule run_ksoftirqd kthread kernel_thread_helper After: 23.77% hald-addon-stor [kernel.kallsyms] [k] schedule | --- schedule | |--60.00%-- schedule_timeout | wait_for_common | wait_for_completion | blk_execute_rq | scsi_execute | scsi_execute_req | sr_test_unit_ready | | | |--66.67%-- sr_media_change | | media_changed | | cdrom_media_changed | | sr_block_media_changed | | check_disk_change | | cdrom_open v2: Always build perf_arch_fetch_caller_regs() now that software events need that too. They don't need it from modules, unlike trace events, so we keep the EXPORT_SYMBOL in trace_event_perf.c Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: David Miller <davem@davemloft.net> --- arch/x86/kernel/cpu/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 60398a0d947c..5fb490c6ee5c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1702,7 +1702,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -#ifdef CONFIG_EVENT_TRACING void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { regs->ip = ip; @@ -1714,4 +1713,3 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } -#endif -- cgit v1.2.2 From ab310b5edb8b601bcb02491ed6f7676da4fd1757 Mon Sep 17 00:00:00 2001 From: Jason Wessel <jason.wessel@windriver.com> Date: Tue, 30 Mar 2010 14:05:07 -0500 Subject: x86,kgdb: Always initialize the hw breakpoint attribute It is required to call hw_breakpoint_init() on an attr before using it in any other calls. This fixes the problem where kgdb will sometimes fail to initialize on x86_64. Signed-off-by: Jason Wessel <jason.wessel@windriver.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: 2.6.33 <stable@kernel.org> LKML-Reference: <1269975907-27602-1-git-send-email-jason.wessel@windriver.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index bfba6019d762..b2258ca91003 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -618,8 +618,8 @@ int kgdb_arch_init(void) * portion of kgdb because this operation requires mutexs to * complete. */ + hw_breakpoint_init(&attr); attr.bp_addr = (unsigned long)kgdb_arch_init; - attr.type = PERF_TYPE_BREAKPOINT; attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; -- cgit v1.2.2 From 909fc87b32b3b9e3f0b87dcc5d98319c41900c58 Mon Sep 17 00:00:00 2001 From: Andi Kleen <andi@firstfloor.org> Date: Mon, 29 Mar 2010 09:41:11 +0200 Subject: x86: Handle overlapping mptables We found a system where the MP table MPC and MPF structures overlap. That doesn't really matter because the mptable is not used anyways with ACPI, but it leads to a panic in the early allocator due to the overlapping reservations in 2.6.33. Earlier kernels handled this without problems. Simply change these reservations to reserve_early_overlap_ok to avoid the panic. Reported-by: Thomas Renninger <trenn@suse.de> Tested-by: Thomas Renninger <trenn@suse.de> Signed-off-by: Andi Kleen <ak@linux.intel.com> LKML-Reference: <20100329074111.GA22821@basil.fritz.box> Signed-off-by: H. Peter Anvin <hpa@zytor.com> Cc: <stable@kernel.org> --- arch/x86/kernel/mpparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index a2c1edd2d3ac..e81030f71a8f 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -664,7 +664,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); - reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); + reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -693,7 +693,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); + reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); if (mpf->physptr) smp_reserve_memory(mpf); -- cgit v1.2.2 From 8da854cb02156c90028233ae1e85ce46a1d3f82c Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com> Date: Thu, 25 Feb 2010 10:53:48 -0800 Subject: x86, hpet: Erratum workaround for read after write of HPET comparator On Wed, Feb 24, 2010 at 03:37:04PM -0800, Justin Piszcz wrote: > Hello, > > Again, on the Intel DP55KG board: > > # uname -a > Linux host 2.6.33 #1 SMP Wed Feb 24 18:31:00 EST 2010 x86_64 GNU/Linux > > [ 1.237600] ------------[ cut here ]------------ > [ 1.237890] WARNING: at arch/x86/kernel/hpet.c:404 hpet_next_event+0x70/0x80() > [ 1.238221] Hardware name: > [ 1.238504] hpet: compare register read back failed. > [ 1.238793] Modules linked in: > [ 1.239315] Pid: 0, comm: swapper Not tainted 2.6.33 #1 > [ 1.239605] Call Trace: > [ 1.239886] <IRQ> [<ffffffff81056c13>] ? warn_slowpath_common+0x73/0xb0 > [ 1.240409] [<ffffffff81079608>] ? tick_dev_program_event+0x38/0xc0 > [ 1.240699] [<ffffffff81056cb0>] ? warn_slowpath_fmt+0x40/0x50 > [ 1.240992] [<ffffffff81079608>] ? tick_dev_program_event+0x38/0xc0 > [ 1.241281] [<ffffffff81041ad0>] ? hpet_next_event+0x70/0x80 > [ 1.241573] [<ffffffff81079608>] ? tick_dev_program_event+0x38/0xc0 > [ 1.241859] [<ffffffff81078e32>] ? tick_handle_oneshot_broadcast+0xe2/0x100 > [ 1.246533] [<ffffffff8102a67a>] ? timer_interrupt+0x1a/0x30 > [ 1.246826] [<ffffffff81085499>] ? handle_IRQ_event+0x39/0xd0 > [ 1.247118] [<ffffffff81087368>] ? handle_edge_irq+0xb8/0x160 > [ 1.247407] [<ffffffff81029f55>] ? handle_irq+0x15/0x20 > [ 1.247689] [<ffffffff810294a2>] ? do_IRQ+0x62/0xe0 > [ 1.247976] [<ffffffff8146be53>] ? ret_from_intr+0x0/0xa > [ 1.248262] <EOI> [<ffffffff8102f277>] ? mwait_idle+0x57/0x80 > [ 1.248796] [<ffffffff8102645c>] ? cpu_idle+0x5c/0xb0 > [ 1.249080] ---[ end trace db7f668fb6fef4e1 ]--- > > Is this something Intel has to fix or is it a bug in the kernel? This is a chipset erratum. Thomas: You mentioned we can retain this check only for known-buggy and hpet debug kind of options. But here is the simple workaround patch for this particular erratum. Some chipsets have a erratum due to which read immediately following a write of HPET comparator returns old comparator value instead of most recently written value. Erratum 15 in "Intel I/O Controller Hub 9 (ICH9) Family Specification Update" (http://www.intel.com/assets/pdf/specupdate/316973.pdf) Workaround for the errata is to read the comparator twice if the first one fails. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> LKML-Reference: <20100225185348.GA9674@linux-os.sc.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@gmail.com> Cc: <stable@kernel.org> --- arch/x86/kernel/hpet.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ee4fa1bfcb33..3d422da92100 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -399,9 +399,15 @@ static int hpet_next_event(unsigned long delta, * then we might have a real hardware problem. We can not do * much about it here, but at least alert the user/admin with * a prominent warning. + * An erratum on some chipsets (ICH9,..), results in comparator read + * immediately following a write returning old value. Workaround + * for this is to read this value second time, when first + * read returns old value. */ - WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, + if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { + WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, KERN_WARNING "hpet: compare register read back failed.\n"); + } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } -- cgit v1.2.2 From b4a5e8a1deca7e61ebaffb37344766b0f0e9f327 Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Thu, 11 Mar 2010 14:00:16 -0800 Subject: x86, hpet: Fix bug in RTC emulation We think there exists a bug in the HPET code that emulates the RTC. In the normal case, when the RTC frequency is set, the rtc driver tells the hpet code about it here: int hpet_set_periodic_freq(unsigned long freq) { uint64_t clc; if (!is_hpet_enabled()) return 0; if (freq <= DEFAULT_RTC_INT_FREQ) hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; else { clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; do_div(clc, freq); clc >>= hpet_clockevent.shift; hpet_pie_delta = (unsigned long) clc; } return 1; } If freq is set to 64Hz (DEFAULT_RTC_INT_FREQ) or lower, then hpet_pie_limit (a static) is set to non-zero. Then, on every one-shot HPET interrupt, hpet_rtc_timer_reinit is called to compute the next timeout. Well, that function has this logic: if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) delta = hpet_default_delta; else delta = hpet_pie_delta; Since hpet_pie_limit is not 0, hpet_default_delta is used. That corresponds to 64Hz. Now, if you set a different rtc frequency, you'll take the else path through hpet_set_periodic_freq, but unfortunately no one resets hpet_pie_limit back to 0. Boom....now you are stuck with 64Hz RTC interrupts forever. The patch below just resets the hpet_pie_limit value when requested freq is greater than DEFAULT_RTC_INT_FREQ, which we think fixes this problem. Signed-off-by: Alok N Kataria <akataria@vmware.com> LKML-Reference: <201003112200.o2BM0Hre012875@imap1.linux-foundation.org> Signed-off-by: Daniel Hecht <dhecht@vmware.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/hpet.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3d422da92100..2bda5f0052f7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1149,6 +1149,7 @@ int hpet_set_periodic_freq(unsigned long freq) do_div(clc, freq); clc >>= hpet_clockevent.shift; hpet_pie_delta = clc; + hpet_pie_limit = 0; } return 1; } -- cgit v1.2.2 From 042be38e6106ed70b42d096ab4a1ed4187e510e6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Thu, 1 Apr 2010 14:32:43 -0700 Subject: ibft, x86: Change reserve_ibft_region() to find_ibft_region() This allows arch code could decide the way to reserve the ibft. And we should reserve ibft as early as possible, instead of BOOTMEM stage, in case the table is in RAM range and is not reserved by BIOS (this will often be the case.) Move to just after find_smp_config(). Also when CONFIG_NO_BOOTMEM=y, We will not have reserve_bootmem() anymore. -v2: fix typo about ibft pointed by Konrad Rzeszutek Wilk <konrad@darnok.org> Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <4BB510FB.80601@kernel.org> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Peter Jones <pjones@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad@kernel.org> CC: Jan Beulich <jbeulich@novell.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/setup.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d76e18570c60..580e6b3dbdb8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -608,6 +608,16 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif +static __init void reserve_ibft_region(void) +{ + unsigned long addr, size = 0; + + addr = find_ibft_region(&size); + + if (size) + reserve_early_overlap_ok(addr, addr + size, "ibft"); +} + #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -910,6 +920,8 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); + reserve_ibft_region(); + reserve_trampoline_memory(); #ifdef CONFIG_ACPI_SLEEP @@ -977,8 +989,6 @@ void __init setup_arch(char **cmdline_p) dma32_reserve_bootmem(); - reserve_ibft_region(); - #ifdef CONFIG_KVM_CLOCK kvmclock_init(); #endif -- cgit v1.2.2 From 85257024096a96fc5c00ce59d685f62bbed3ad95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Tue, 23 Mar 2010 19:30:52 +0100 Subject: x86: Move notify_cpu_starting() callback to a later stage Because we need to have cpu identification things done by the time we run CPU_STARTING notifiers. ( This init ordering will be relied on by the next fix. ) Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1269353485.5109.48.camel@twins> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 06d98ae5a802..6808b934d6c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -242,8 +242,6 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); - notify_cpu_starting(cpuid); - /* * Need to setup vector mappings before we enable interrupts. */ @@ -264,6 +262,8 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + notify_cpu_starting(cpuid); + /* * Allow the master to continue. */ -- cgit v1.2.2 From b38b24ead33417146e051453d04bf60b8d2d7e25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Tue, 23 Mar 2010 19:31:15 +0100 Subject: perf, x86: Fix AMD hotplug & constraint initialization Commit 3f6da39 ("perf: Rework and fix the arch CPU-hotplug hooks") moved the amd northbridge allocation from CPUS_ONLINE to CPUS_PREPARE_UP however amd_nb_id() doesn't work yet on prepare so it would simply bail basically reverting to a state where we do not properly track node wide constraints - causing weird perf results. Fix up the AMD NorthBridge initialization code by allocating from CPU_UP_PREPARE and installing it from CPU_STARTING once we have the proper nb_id. It also properly deals with the allocation failing. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> [ robustify using amd_has_nb() ] Signed-off-by: Stephane Eranian <eranian@google.com> LKML-Reference: <1269353485.5109.48.camel@twins> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 8 ++-- arch/x86/kernel/cpu/perf_event_amd.c | 80 +++++++++++++++++++++--------------- 2 files changed, 52 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5fb490c6ee5c..bd28cf9d8a82 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -158,7 +158,7 @@ struct x86_pmu { struct perf_event *event); struct event_constraint *event_constraints; - void (*cpu_prepare)(int cpu); + int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); @@ -1333,11 +1333,12 @@ static int __cpuinit x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; + int ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: if (x86_pmu.cpu_prepare) - x86_pmu.cpu_prepare(cpu); + ret = x86_pmu.cpu_prepare(cpu); break; case CPU_STARTING: @@ -1350,6 +1351,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) x86_pmu.cpu_dying(cpu); break; + case CPU_UP_CANCELED: case CPU_DEAD: if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); @@ -1359,7 +1361,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; } - return NOTIFY_OK; + return ret; } static void __init pmu_check_apic(void) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index b87e0b6970cb..db6f7d4056e1 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -137,6 +137,13 @@ static inline int amd_is_nb_event(struct hw_perf_event *hwc) return (hwc->config & 0xe0) == 0xe0; } +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + static void amd_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -147,7 +154,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, /* * only care about NB events */ - if (!(nb && amd_is_nb_event(hwc))) + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) return; /* @@ -214,7 +221,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) /* * if not NB event or no NB, then no constraints */ - if (!(nb && amd_is_nb_event(hwc))) + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) return &unconstrained; /* @@ -293,51 +300,55 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) return nb; } -static void amd_pmu_cpu_online(int cpu) +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu, -1); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) { - struct cpu_hw_events *cpu1, *cpu2; - struct amd_nb *nb = NULL; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct amd_nb *nb; int i, nb_id; if (boot_cpu_data.x86_max_cores < 2) return; - /* - * function may be called too early in the - * boot process, in which case nb_id is bogus - */ nb_id = amd_get_nb_id(cpu); - if (nb_id == BAD_APICID) - return; - - cpu1 = &per_cpu(cpu_hw_events, cpu); - cpu1->amd_nb = NULL; + WARN_ON_ONCE(nb_id == BAD_APICID); raw_spin_lock(&amd_nb_lock); for_each_online_cpu(i) { - cpu2 = &per_cpu(cpu_hw_events, i); - nb = cpu2->amd_nb; - if (!nb) + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) continue; - if (nb->nb_id == nb_id) - goto found; - } - nb = amd_alloc_nb(cpu, nb_id); - if (!nb) { - pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); - raw_spin_unlock(&amd_nb_lock); - return; + if (nb->nb_id == nb_id) { + kfree(cpuc->amd_nb); + cpuc->amd_nb = nb; + break; + } } -found: - nb->refcnt++; - cpu1->amd_nb = nb; + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; raw_spin_unlock(&amd_nb_lock); } -static void amd_pmu_cpu_offline(int cpu) +static void amd_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuhw; @@ -349,8 +360,10 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_lock(&amd_nb_lock); if (cpuhw->amd_nb) { - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); cpuhw->amd_nb = NULL; } @@ -379,8 +392,9 @@ static __initconst struct x86_pmu amd_pmu = { .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, - .cpu_prepare = amd_pmu_cpu_online, - .cpu_dead = amd_pmu_cpu_offline, + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, }; static __init int amd_pmu_init(void) -- cgit v1.2.2 From 257ef9d21f1b008a6c7425544b36641c4325a922 Mon Sep 17 00:00:00 2001 From: Torok Edwin <edwintorok@gmail.com> Date: Wed, 17 Mar 2010 12:07:16 +0200 Subject: perf, x86: Fix callgraphs of 32-bit processes on 64-bit kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When profiling a 32-bit process on a 64-bit kernel, callgraph tracing stopped after the first function, because it has seen a garbage memory address (tried to interpret the frame pointer, and return address as a 64-bit pointer). Fix this by using a struct stack_frame with 32-bit pointers when the TIF_IA32 flag is set. Note that TIF_IA32 flag must be used, and not is_compat_task(), because the latter is only set when the 32-bit process is executing a syscall, which may not always be the case (when tracing page fault events for example). Signed-off-by: Török Edwin <edwintorok@gmail.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Paul Mackerras <paulus@samba.org> Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org LKML-Reference: <1268820436-13145-1-git-send-email-edwintorok@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event.c | 44 +++++++++++++++++++++++++++++++++++----- arch/x86/kernel/dumpstack.h | 5 +++++ 2 files changed, 44 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bd28cf9d8a82..53ea4cf1a878 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -28,6 +28,7 @@ #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> +#include <asm/compat.h> static u64 perf_event_mask __read_mostly; @@ -1630,14 +1631,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +#ifdef CONFIG_COMPAT +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bytes; + /* 32-bit process in 64-bit kernel. */ + struct stack_frame_ia32 frame; + const void __user *fp; + + if (!test_thread_flag(TIF_IA32)) + return 0; + + fp = compat_ptr(regs->bp); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) + break; + + if (fp < compat_ptr(regs->sp)) + break; - return bytes == sizeof(*frame); + callchain_store(entry, frame.return_address); + fp = compat_ptr(frame.next_frame); + } + return 1; } +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; +} +#endif static void perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) @@ -1653,11 +1682,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); + if (perf_callchain_user32(regs, entry)) + return; + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) break; if ((unsigned long)fp < regs->sp) diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 29e5f7c845b2..e39e77168a37 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -30,6 +30,11 @@ struct stack_frame { unsigned long return_address; }; +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + static inline unsigned long rewind_frame_pointer(int n) { struct stack_frame *frame; -- cgit v1.2.2 From 472a474c6630efd195d3738339fd1bdc8aa3b1aa Mon Sep 17 00:00:00 2001 From: Suresh Siddha <suresh.b.siddha@intel.com> Date: Wed, 31 Mar 2010 18:04:47 -0700 Subject: x86: Fix double enable_IR_x2apic() call on SMP kernel on !SMP boards Jan Grossmann reported kernel boot panic while booting SMP kernel on his system with a single core cpu. SMP kernels call enable_IR_x2apic() from native_smp_prepare_cpus() and on platforms where the kernel doesn't find SMP configuration we ended up again calling enable_IR_x2apic() from the APIC_init_uniprocessor() call in the smp_sanity_check(). Thus leading to kernel panic. Don't call enable_IR_x2apic() and default_setup_apic_routing() from APIC_init_uniprocessor() in CONFIG_SMP case. NOTE: this kind of non-idempotent and assymetric initialization sequence is rather fragile and unclean, we'll clean that up in v2.6.35. This is the minimal fix for v2.6.34. Reported-by: Jan.Grossmann@kielnet.net Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Cc: <jbarnes@virtuousgeek.org> Cc: <david.woodhouse@intel.com> Cc: <weidong.han@intel.com> Cc: <youquan.song@intel.com> Cc: <Jan.Grossmann@kielnet.net> Cc: <stable@kernel.org> # [v2.6.32.x, v2.6.33.x] LKML-Reference: <1270083887.7835.78.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic/apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 00187f1fcfb7..e5a4a1e01618 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1640,8 +1640,10 @@ int __init APIC_init_uniprocessor(void) } #endif +#ifndef CONFIG_SMP enable_IR_x2apic(); default_setup_apic_routing(); +#endif verify_local_APIC(); connect_bsp_APIC(); -- cgit v1.2.2 From 134fbadf028a5977a1b06b0253d3ee33e6f0c642 Mon Sep 17 00:00:00 2001 From: Vince Weaver <vweaver1@eecs.utk.edu> Date: Tue, 6 Apr 2010 10:01:19 -0400 Subject: perf, x86: Enable Nehalem-EX support According to Intel Software Devel Manual Volume 3B, the Nehalem-EX PMU is just like regular Nehalem (except for the uncore support, which is completely different). Signed-off-by: Vince Weaver <vweaver1@eecs.utk.edu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Paul Mackerras <paulus@samba.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Lin Ming <ming.m.lin@intel.com> LKML-Reference: <alpine.DEB.2.00.1004060956580.1417@cl320.eecs.utk.edu> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 84bfde64a337..9c794ac87837 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -936,6 +936,7 @@ static __init int intel_pmu_init(void) case 26: /* 45 nm nehalem, "Bloomfield" */ case 30: /* 45 nm nehalem, "Lynnfield" */ + case 46: /* 45 nm nehalem-ex, "Beckton" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.2.2 From 75f66533bc883f761a7adcab3281fe3323efbc90 Mon Sep 17 00:00:00 2001 From: Chris Wright <chrisw@sous-sol.org> Date: Fri, 2 Apr 2010 18:27:52 -0700 Subject: x86/amd-iommu: enable iommu before attaching devices Hit another kdump problem as reported by Neil Horman. When initializaing the IOMMU, we attach devices to their domains before the IOMMU is fully (re)initialized. Attaching a device will issue some important invalidations. In the context of the newly kexec'd kdump kernel, the IOMMU may have stale cached data from the original kernel. Because we do the attach too early, the invalidation commands are placed in the new command buffer before the IOMMU is updated w/ that buffer. This leaves the stale entries in the kdump context and can renders device unusable. Simply enable the IOMMU before we do the attach. Cc: stable@kernel.org Cc: Neil Horman <nhorman@tuxdriver.com> Cc: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu_init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index feaf47184900..8975965f3e67 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1304,6 +1304,8 @@ static int __init amd_iommu_init(void) if (ret) goto free; + enable_iommus(); + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else @@ -1316,8 +1318,6 @@ static int __init amd_iommu_init(void) amd_iommu_init_notifier(); - enable_iommus(); - if (iommu_pass_through) goto out; @@ -1331,6 +1331,7 @@ out: return ret; free: + disable_iommus(); amd_iommu_uninit_devices(); -- cgit v1.2.2 From 549c90dc9a6d659e792b2a42a0930c7da015ea4a Mon Sep 17 00:00:00 2001 From: Chris Wright <chrisw@sous-sol.org> Date: Fri, 2 Apr 2010 18:27:53 -0700 Subject: x86/amd-iommu: warn when issuing command to uninitialized cmd buffer To catch future potential issues we can add a warning whenever we issue a command before the command buffer is fully initialized. Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 1 + arch/x86/kernel/amd_iommu_init.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b06f29e275e9..71dfc0af8e50 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) u32 tail, head; u8 *target; + WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8975965f3e67..5edf41c7127c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -438,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) if (cmd_buf == NULL) return NULL; - iommu->cmd_buf_size = CMD_BUFFER_SIZE; + iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED; return cmd_buf; } @@ -474,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) &entry, sizeof(entry)); amd_iommu_reset_cmd_buffer(iommu); + iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED); } static void __init free_command_buffer(struct amd_iommu *iommu) { free_pages((unsigned long)iommu->cmd_buf, - get_order(iommu->cmd_buf_size)); + get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED))); } /* allocates the memory where the IOMMU will log its events to */ -- cgit v1.2.2 From 8f9f55e83e939724490d7cde3833c4883c6d1310 Mon Sep 17 00:00:00 2001 From: Chris Wright <chrisw@sous-sol.org> Date: Fri, 2 Apr 2010 18:27:54 -0700 Subject: Revert "x86: disable IOMMUs on kernel crash" This effectively reverts commit 61d047be99757fd9b0af900d7abce9a13a337488. Disabling the IOMMU can potetially allow DMA transactions to complete without being translated. Leave it enabled, and allow crash kernel to do the IOMMU reinitialization properly. Cc: stable@kernel.org Cc: Joerg Roedel <joerg.roedel@amd.com> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Neil Horman <nhorman@tuxdriver.com> Cc: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/crash.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a4849c10a77e..ebd4c51d096a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,7 +27,6 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> -#include <asm/x86_init.h> #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif - -#ifdef CONFIG_X86_64 - x86_platform.iommu_shutdown(); -#endif - crash_save_cpu(regs, safe_smp_processor_id()); } -- cgit v1.2.2 From d18c69d3898985c66cd6e878b8f576fd9a21ab39 Mon Sep 17 00:00:00 2001 From: Chris Wright <chrisw@sous-sol.org> Date: Fri, 2 Apr 2010 18:27:55 -0700 Subject: x86/amd-iommu: use for_each_pci_dev Replace open coded version with for_each_pci_dev Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 71dfc0af8e50..494956813951 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2187,7 +2187,7 @@ static void prealloc_protection_domains(void) struct dma_ops_domain *dma_dom; u16 devid; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { /* Do we handle this device? */ if (!check_device(&dev->dev)) -- cgit v1.2.2 From 4b83873d3da0704987cb116833818ed96214ee29 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Wed, 7 Apr 2010 12:57:35 +0200 Subject: x86/gart: Disable GART explicitly before initialization If we boot into a crash-kernel the gart might still be enabled and its caches might be dirty. This can result in undefined behavior later. Fix it by explicitly disabling the gart hardware before initialization and flushing the caches after enablement. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> --- arch/x86/kernel/aperture_64.c | 15 ++++++++++++++- arch/x86/kernel/pci-gart_64.c | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3704997e8b25..b5d8b0bcf235 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -393,6 +393,7 @@ void __init gart_iommu_hole_init(void) for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; + u32 ctl; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -406,7 +407,19 @@ void __init gart_iommu_hole_init(void) gart_iommu_aperture = 1; x86_init.iommu.iommu_init = gart_iommu_init; - aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); + + /* + * Before we do anything else disable the GART. It may + * still be enabled if we boot into a crash-kernel here. + * Reconfiguring the GART while it is enabled could have + * unknown side-effects. + */ + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + + aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index f3af115a573a..0ae24d9b44b3 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -564,6 +564,9 @@ static void enable_gart_translations(void) enable_gart_translation(dev, __pa(agp_gatt_table)); } + + /* Flush the GART-TLB to remove stale entries */ + k8_flush_garts(); } /* -- cgit v1.2.2 From ab285f2b5290d92b7ec1a6f9aad54308dadf6157 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Thu, 8 Apr 2010 14:05:50 +0200 Subject: perf: Fix unsafe frame rewinding with hot regs fetching When we fetch the hot regs and rewind to the nth caller, it might happen that we dereference a frame pointer outside the kernel stack boundaries, like in this example: perf_trace_sched_switch+0xd5/0x120 schedule+0x6b5/0x860 retint_careful+0xd/0x21 Since we directly dereference a userspace frame pointer here while rewinding behind retint_careful, this may end up in a crash. Fix this by simply using probe_kernel_address() when we rewind the frame pointer. This issue will have a much more proper fix in the next version of the perf_arch_fetch_caller_regs() API that will only need to rewind to the first caller. Reported-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Tested-by: Eric Dumazet <eric.dumazet@gmail.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: David Miller <davem@davemloft.net> Cc: Archs <linux-arch@vger.kernel.org> --- arch/x86/kernel/dumpstack.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index e39e77168a37..e1a93be4fd44 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -14,6 +14,8 @@ #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif +#include <linux/uaccess.h> + extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl); @@ -42,8 +44,10 @@ static inline unsigned long rewind_frame_pointer(int n) get_bp(frame); #ifdef CONFIG_FRAME_POINTER - while (n--) - frame = frame->next_frame; + while (n--) { + if (probe_kernel_address(&frame->next_frame, frame)) + break; + } #endif return (unsigned long)frame; -- cgit v1.2.2 From 453dc65931915abc61f92e12bba1fc4747ff5542 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov <dtor@vmware.com> Date: Fri, 23 Apr 2010 13:18:08 -0400 Subject: VMware Balloon driver This is a standalone version of VMware Balloon driver. Ballooning is a technique that allows hypervisor dynamically limit the amount of memory available to the guest (with guest cooperation). In the overcommit scenario, when hypervisor set detects that it needs to shuffle some memory, it instructs the driver to allocate certain number of pages, and the underlying memory gets returned to the hypervisor. Later hypervisor may return memory to the guest by reattaching memory to the pageframes and instructing the driver to "deflate" balloon. We are submitting a standalone driver because KVM maintainer (Avi Kivity) expressed opinion (rightly) that our transport does not fit well into virtqueue paradigm and thus it does not make much sense to integrate with virtio. There were also some concerns whether current ballooning technique is the right thing. If there appears a better framework to achieve this we are prepared to evaluate and switch to using it, but in the meantime we'd like to get this driver upstream. We want to get the driver accepted in distributions so that users do not have to deal with an out-of-tree module and many distributions have "upstream first" requirement. The driver has been shipping for a number of years and users running on VMware platform will have it installed as part of VMware Tools even if it will not come from a distribution, thus there should not be additional risk in pulling the driver into mainline. The driver will only activate if host is VMware so everyone else should not be affected at all. Signed-off-by: Dmitry Torokhov <dtor@vmware.com> Cc: Avi Kivity <avi@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/cpu/vmware.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 1cbed97b59cf..dfdb4dba2320 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -22,6 +22,7 @@ */ #include <linux/dmi.h> +#include <linux/module.h> #include <asm/div64.h> #include <asm/vmware.h> #include <asm/x86_init.h> @@ -101,6 +102,7 @@ int vmware_platform(void) return 0; } +EXPORT_SYMBOL(vmware_platform); /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. -- cgit v1.2.2 From 402af0d7c692ddcfa2333e93d3f275ebd0487926 Mon Sep 17 00:00:00 2001 From: Jan Beulich <JBeulich@novell.com> Date: Wed, 21 Apr 2010 15:21:51 +0100 Subject: x86, asm: Introduce and use percpu_inc() ... generating slightly smaller code. Signed-off-by: Jan Beulich <jbeulich@novell.com> LKML-Reference: <4BCF261F020000780003B33C@vpn.id2.novell.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8a6f0afa767e..7a355ddcc64b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - __get_cpu_var(mce_poll_count)++; + percpu_inc(mce_poll_count); mce_setup(&m); @@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); - __get_cpu_var(mce_exception_count)++; + percpu_inc(mce_exception_count); if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) -- cgit v1.2.2 From 5967ed87ade85a421ef814296c3c7f182b08c225 Mon Sep 17 00:00:00 2001 From: Jan Beulich <JBeulich@novell.com> Date: Wed, 21 Apr 2010 16:08:14 +0100 Subject: x86-64: Reduce SMP locks table size Reduce the SMP locks table size by using relative pointers instead of absolute ones, thus cutting the table size by half. Signed-off-by: Jan Beulich <jbeulich@novell.com> LKML-Reference: <4BCF30FE020000780003B3B6@vpn.id2.novell.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/alternative.c | 45 ++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1a160d5d44d0..936738427223 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern u8 *__smp_locks[], *__smp_locks_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; static void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -235,37 +235,39 @@ void __init_or_module apply_alternatives(struct alt_instr *start, #ifdef CONFIG_SMP -static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) +static void alternatives_smp_lock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) { - u8 **ptr; + const s32 *poff; mutex_lock(&text_mutex); - for (ptr = start; ptr < end; ptr++) { - if (*ptr < text) - continue; - if (*ptr > text_end) + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) continue; /* turn DS segment override prefix into lock prefix */ - text_poke(*ptr, ((unsigned char []){0xf0}), 1); + text_poke(ptr, ((unsigned char []){0xf0}), 1); }; mutex_unlock(&text_mutex); } -static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) +static void alternatives_smp_unlock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) { - u8 **ptr; + const s32 *poff; if (noreplace_smp) return; mutex_lock(&text_mutex); - for (ptr = start; ptr < end; ptr++) { - if (*ptr < text) - continue; - if (*ptr > text_end) + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) continue; /* turn lock prefix into DS segment override prefix */ - text_poke(*ptr, ((unsigned char []){0x3E}), 1); + text_poke(ptr, ((unsigned char []){0x3E}), 1); }; mutex_unlock(&text_mutex); } @@ -276,8 +278,8 @@ struct smp_alt_module { char *name; /* ptrs to lock prefixes */ - u8 **locks; - u8 **locks_end; + const s32 *locks; + const s32 *locks_end; /* .text segment, needed to avoid patching init code ;) */ u8 *text; @@ -398,16 +400,19 @@ void alternatives_smp_switch(int smp) int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; - u8 **ptr; + const s32 *poff; u8 *text_start = start; u8 *text_end = end; list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; - for (ptr = mod->locks; ptr < mod->locks_end; ptr++) - if (text_start <= *ptr && text_end >= *ptr) + for (poff = mod->locks; poff < mod->locks_end; poff++) { + const u8 *ptr = (const u8 *)poff + *poff; + + if (text_start <= ptr && text_end > ptr) return 1; + } } return 0; -- cgit v1.2.2