From c41b93fb8551148a93d3bba870365e8489317f02 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 8 Feb 2011 23:41:35 +0100 Subject: ACPI / PM: Drop acpi_restore_state_mem() The function acpi_restore_state_mem() has never been and most likely never will be used, so remove it. Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/acpi.h | 3 +-- arch/x86/kernel/acpi/sleep.c | 8 -------- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 211ca3f7fd16..47981f0d121b 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -112,9 +112,8 @@ static inline void acpi_disable_pci(void) acpi_noirq_set(); } -/* routines for saving/restoring kernel state */ +/* Routine for saving kernel state during suspend. */ extern int acpi_save_state_mem(void); -extern void acpi_restore_state_mem(void); extern unsigned long acpi_wakeup_address; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 68d1537b8c81..c27a483094b1 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -110,14 +110,6 @@ int acpi_save_state_mem(void) return 0; } -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - - /** * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation * -- cgit v1.2.2 From f1a2003e22f6b50ea21f7f4b38b38c5ebc9c8017 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 8 Feb 2011 23:42:22 +0100 Subject: ACPI / PM: Merge do_suspend_lowlevel() into acpi_save_state_mem() The function do_suspend_lowlevel() is specific to x86 and defined in assembly code, so it should be called from the x86 low-level suspend code rather than from acpi_suspend_enter(). Merge do_suspend_lowlevel() into the x86's acpi_save_state_mem() and change the name of the latter to acpi_suspend_lowlevel(), so that the function's purpose is better reflected by its name. Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/acpi.h | 4 ++-- arch/x86/kernel/acpi/sleep.c | 5 +++-- arch/x86/kernel/acpi/sleep.h | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 47981f0d121b..aa92684aa674 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -112,8 +112,8 @@ static inline void acpi_disable_pci(void) acpi_noirq_set(); } -/* Routine for saving kernel state during suspend. */ -extern int acpi_save_state_mem(void); +/* Low-level suspend routine. */ +extern int acpi_suspend_lowlevel(void); extern unsigned long acpi_wakeup_address; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index c27a483094b1..5f1b747f6ef1 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -29,14 +29,14 @@ static char temp_stack[4096]; #endif /** - * acpi_save_state_mem - save kernel state + * acpi_suspend_lowlevel - save kernel state * * Create an identity mapped page table and copy the wakeup routine to * low memory. * * Note that this is too late to change acpi_wakeup_address. */ -int acpi_save_state_mem(void) +int acpi_suspend_lowlevel(void) { struct wakeup_header *header; @@ -107,6 +107,7 @@ int acpi_save_state_mem(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + do_suspend_lowlevel(); return 0; } diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index adbcbaa6f1df..31ce13f20297 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -14,3 +14,5 @@ extern char swsusp_pg_dir[PAGE_SIZE]; extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); -- cgit v1.2.2 From fc66c5210ec2539e800e87d7b3a985323c7be96e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Sat, 19 Mar 2011 18:20:05 +0100 Subject: perf, x86: Fix Intel fixed counters base initialization The following patch solves the problems introduced by Robert's commit 41bf498 and reported by Arun Sharma. This commit gets rid of the base + index notation for reading and writing PMU msrs. The problem is that for fixed counters, the new calculation for the base did not take into account the fixed counter indexes, thus all fixed counters were read/written from fixed counter 0. Although all fixed counters share the same config MSR, they each have their own counter register. Without: $ task -e unhalted_core_cycles -e instructions_retired -e baclears noploop 1 noploop for 1 seconds 242202299 unhalted_core_cycles (0.00% scaling, ena=1000790892, run=1000790892) 2389685946 instructions_retired (0.00% scaling, ena=1000790892, run=1000790892) 49473 baclears (0.00% scaling, ena=1000790892, run=1000790892) With: $ task -e unhalted_core_cycles -e instructions_retired -e baclears noploop 1 noploop for 1 seconds 2392703238 unhalted_core_cycles (0.00% scaling, ena=1000840809, run=1000840809) 2389793744 instructions_retired (0.00% scaling, ena=1000840809, run=1000840809) 47863 baclears (0.00% scaling, ena=1000840809, run=1000840809) Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ming.m.lin@intel.com Cc: robert.richter@amd.com Cc: asharma@fb.com Cc: perfmon2-devel@lists.sf.net LKML-Reference: <20110319172005.GB4978@quad> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e8dbe179587f..ec46eea0c4ed 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -912,7 +912,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->event_base = 0; } else if (hwc->idx >= X86_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); -- cgit v1.2.2 From 885b976fada5bc6595a9fd3e67e3cb1a3d11f50b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 21 Feb 2011 13:54:41 +0800 Subject: ACPI, APEI, Add ERST record ID cache APEI ERST firmware interface and implementation has no multiple users in mind. For example, if there is four records in storage with ID: 1, 2, 3 and 4, if two ERST readers enumerate the records via GET_NEXT_RECORD_ID as follow, reader 1 reader 2 1 2 3 4 -1 -1 where -1 signals there is no more record ID. Reader 1 has no chance to check record 2 and 4, while reader 2 has no chance to check record 1 and 3. And any other GET_NEXT_RECORD_ID will return -1, that is, other readers will has no chance to check any record even they are not cleared by anyone. This makes raw GET_NEXT_RECORD_ID not suitable for used by multiple users. To solve the issue, an in-memory ERST record ID cache is designed and implemented. When enumerating record ID, the ID returned by GET_NEXT_RECORD_ID is added into cache in addition to be returned to caller. So other readers can check the cache to get all record ID available. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Signed-off-by: Len Brown --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 42 ++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472b27a5..83930deec3c6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ -- cgit v1.2.2 From cbb84c4cc1ad0ab8faaffd899ccc9b14a88c91be Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 22 Mar 2011 15:24:54 +0600 Subject: x86, mpparse: Move check_slot into CONFIG_X86_IO_APIC context When CONFIG_X86_MPPARSE=y and CONFIG_X86_IO_APIC=n, then we get the following warning: arch/x86/kernel/mpparse.c:723: warning: 'check_slot' defined but not used So, put check_slot into CONFIG_X86_IO_APIC context. Its only called from CONFIG_X86_IO_APIC=y context. Signed-off-by: Rakib Mullick LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6f789a887c06..5a532ce646bf 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -714,10 +714,6 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) *nr_m_spare += 1; } } -#else /* CONFIG_X86_IO_APIC */ -static -inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} -#endif /* CONFIG_X86_IO_APIC */ static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) @@ -731,6 +727,10 @@ check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) return ret; } +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ static int __init replace_intsrc_all(struct mpc_table *mpc, unsigned long mpc_new_phys, -- cgit v1.2.2 From 1310e6d638b302bd9cd064f8de7dcd546bb7f597 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Thu, 17 Feb 2011 19:07:36 -0800 Subject: mfd: Add sharing for cs5535 acpi/pms cells This enables sharing of cs5535-mfd cells via the new mfd_shared_* API. Hooks for enable/disble of resources are added, with refcounting of resources being automatically handled so that cs5535_mfd_res_enable/disable are only called when necessary. Clients of cs5535-mfd (in this case, olpc-xo1.c) are also modified to use the mfd_shared API. The platform drivers are also renamed to olpc-xo1-{pms,acpi}, and resource enabling/disabling is replaced with mfd_shared API calls. Signed-off-by: Andres Salomon Signed-off-by: Samuel Ortiz --- arch/x86/platform/olpc/olpc-xo1.c | 42 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index 127775696d6c..f4155354a7b0 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -56,25 +57,24 @@ static void xo1_power_off(void) static int __devinit olpc_xo1_probe(struct platform_device *pdev) { struct resource *res; + int err; /* don't run on non-XOs */ if (!machine_is_olpc()) return -ENODEV; + err = mfd_shared_cell_enable(pdev); + if (err) + return err; + res = platform_get_resource(pdev, IORESOURCE_IO, 0); if (!res) { dev_err(&pdev->dev, "can't fetch device resource info\n"); return -EIO; } - - if (!request_region(res->start, resource_size(res), DRV_NAME)) { - dev_err(&pdev->dev, "can't request region\n"); - return -EIO; - } - - if (strcmp(pdev->name, "cs5535-pms") == 0) + if (strcmp(pdev->name, "olpc-xo1-pms") == 0) pms_base = res->start; - else if (strcmp(pdev->name, "cs5535-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-ac-acpi") == 0) acpi_base = res->start; /* If we have both addresses, we can override the poweroff hook */ @@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev) static int __devexit olpc_xo1_remove(struct platform_device *pdev) { - struct resource *r; - - r = platform_get_resource(pdev, IORESOURCE_IO, 0); - release_region(r->start, resource_size(r)); + mfd_shared_cell_disable(pdev); - if (strcmp(pdev->name, "cs5535-pms") == 0) + if (strcmp(pdev->name, "olpc-xo1-pms") == 0) pms_base = 0; - else if (strcmp(pdev->name, "cs5535-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0) acpi_base = 0; pm_power_off = NULL; @@ -104,7 +101,7 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev) static struct platform_driver cs5535_pms_drv = { .driver = { - .name = "cs5535-pms", + .name = "olpc-xo1-pms", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, @@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = { static struct platform_driver cs5535_acpi_drv = { .driver = { - .name = "cs5535-acpi", + .name = "olpc-xo1-acpi", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, @@ -124,26 +121,27 @@ static int __init olpc_xo1_init(void) { int r; - r = platform_driver_register(&cs5535_pms_drv); + r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms"); if (r) return r; - r = platform_driver_register(&cs5535_acpi_drv); + r = mfd_shared_platform_driver_register(&cs5535_acpi_drv, + "cs5535-acpi"); if (r) - platform_driver_unregister(&cs5535_pms_drv); + mfd_shared_platform_driver_unregister(&cs5535_pms_drv); return r; } static void __exit olpc_xo1_exit(void) { - platform_driver_unregister(&cs5535_acpi_drv); - platform_driver_unregister(&cs5535_pms_drv); + mfd_shared_platform_driver_unregister(&cs5535_acpi_drv); + mfd_shared_platform_driver_unregister(&cs5535_pms_drv); } MODULE_AUTHOR("Daniel Drake "); MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:olpc-xo1"); +MODULE_ALIAS("platform:cs5535-pms"); module_init(olpc_xo1_init); module_exit(olpc_xo1_exit); -- cgit v1.2.2 From f77289ac25b0c81acbed6f9c17cb14809a04e18b Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Thu, 3 Mar 2011 09:51:58 -0800 Subject: mfd: Rename mfd_shared_cell_{en,dis}able to drop the "shared" part As requested by Samuel, there's not really any reason to have "shared" in the name. This also modifies the only user of the function, as well. Signed-off-by: Andres Salomon Signed-off-by: Samuel Ortiz --- arch/x86/platform/olpc/olpc-xo1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index f4155354a7b0..99513642a0e6 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -63,7 +63,7 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev) if (!machine_is_olpc()) return -ENODEV; - err = mfd_shared_cell_enable(pdev); + err = mfd_cell_enable(pdev); if (err) return err; @@ -88,7 +88,7 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev) static int __devexit olpc_xo1_remove(struct platform_device *pdev) { - mfd_shared_cell_disable(pdev); + mfd_cell_disable(pdev); if (strcmp(pdev->name, "olpc-xo1-pms") == 0) pms_base = 0; -- cgit v1.2.2 From c2ef45df3b98a027ec8f9081bd2a19dff520ef9d Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:13 -0400 Subject: x86: add context tag to mark mm when running a task in 32-bit compatibility mode This tag is intended to mirror the thread info TIF_IA32 flag. Will be used to identify mm's which support 32 bit tasks running in compatibility mode without requiring a reference to the task itself. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- arch/x86/include/asm/mmu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 80a1dee5bea5..aeff3e89b222 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -13,6 +13,12 @@ typedef struct { int size; struct mutex lock; void *vdso; + +#ifdef CONFIG_X86_64 + /* True if mm supports a task running in 32 bit compatibility mode. */ + unsigned short ia32_compat; +#endif + } mm_context_t; #ifdef CONFIG_SMP -- cgit v1.2.2 From 375906f8765e131a4a159b1ffebf78c15db7b3bf Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:14 -0400 Subject: x86: mark associated mm when running a task in 32 bit compatibility mode This patch simply follows the same practice as for setting the TIF_IA32 flag. In particular, an mm is marked as holding 32-bit tasks when a 32-bit binary is exec'ed. Both ELF and a.out formats are updated. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- arch/x86/ia32/ia32_aout.c | 1 + arch/x86/kernel/process_64.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 2d93bdbc9ac0..fd843877e841 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); + current->mm->context.ia32_compat = 1; setup_new_exec(bprm); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bd387e8f73b4..6c9dd922ac0d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -501,6 +501,10 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that @@ -516,6 +520,10 @@ void set_personality_ia32(void) set_thread_flag(TIF_IA32); current->personality |= force_personality32; + /* Mark the associated mm as containing 32-bit tasks. */ + if (current->mm) + current->mm->context.ia32_compat = 1; + /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; } -- cgit v1.2.2 From 31db58b3ab432f72ea76be58b12e6ffaf627d5db Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:15 -0400 Subject: mm: arch: make get_gate_vma take an mm_struct instead of a task_struct Morally, the presence of a gate vma is more an attribute of a particular mm than a particular task. Moreover, dropping the dependency on task_struct will help make both existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- arch/x86/mm/init_64.c | 6 +++--- arch/x86/vdso/vdso32-setup.c | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0aa34669ed3f..dd4809b58441 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -861,10 +861,10 @@ static struct vm_area_struct gate_vma = { .vm_flags = VM_READ | VM_EXEC }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) + if (!mm || mm->context.ia32_compat) return NULL; #endif return &gate_vma; @@ -872,7 +872,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { - struct vm_area_struct *vma = get_gate_vma(task); + struct vm_area_struct *vma = get_gate_vma(task->mm); if (!vma) return 0; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 36df991985b2..1f651f6bdf61 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -417,11 +417,12 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - struct mm_struct *mm = tsk->mm; - - /* Check to see if this task was created in compat vdso mode */ + /* + * Check to see if the corresponding task was created in compat vdso + * mode. + */ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) return &gate_vma; return NULL; @@ -429,7 +430,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { - const struct vm_area_struct *vma = get_gate_vma(task); + const struct vm_area_struct *vma = get_gate_vma(task->mm); return vma && addr >= vma->vm_start && addr < vma->vm_end; } -- cgit v1.2.2 From 83b964bbf82eb13a8f31bb49ca420787fe01f7a6 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:16 -0400 Subject: mm: arch: make in_gate_area take an mm_struct instead of a task_struct Morally, the question of whether an address lies in a gate vma should be asked with respect to an mm, not a particular task. Moreover, dropping the dependency on task_struct will help make existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- arch/x86/mm/init_64.c | 4 ++-- arch/x86/vdso/vdso32-setup.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dd4809b58441..43c441622c89 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -870,9 +870,9 @@ struct vm_area_struct *get_gate_vma(struct mm_struct *mm) return &gate_vma; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = get_gate_vma(task->mm); + struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 1f651f6bdf61..f849bb29fda1 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -428,9 +428,9 @@ struct vm_area_struct *get_gate_vma(struct mm_struct *mm) return NULL; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - const struct vm_area_struct *vma = get_gate_vma(task->mm); + const struct vm_area_struct *vma = get_gate_vma(mm); return vma && addr >= vma->vm_start && addr < vma->vm_end; } -- cgit v1.2.2 From cae5d39032acf26c265f6b1dc73d7ce6ff4bc387 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:17 -0400 Subject: mm: arch: rename in_gate_area_no_task to in_gate_area_no_mm Now that gate vma's are referenced with respect to a particular mm and not a particular task it only makes sense to propagate the change to this predicate as well. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- arch/x86/mm/init_64.c | 8 ++++---- arch/x86/vdso/vdso32-setup.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 43c441622c89..835393c85546 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -881,11 +881,11 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr) } /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. */ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index f849bb29fda1..468d591dde31 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -435,7 +435,7 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr) return vma && addr >= vma->vm_start && addr < vma->vm_end; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } -- cgit v1.2.2 From f3c6ea1b06c71b43f751b36bd99345369fe911af Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Mar 2011 22:15:54 +0100 Subject: x86: Use syscore_ops instead of sysdev classes and sysdevs Some subsystems in the x86 tree need to carry out suspend/resume and shutdown operations with one CPU on-line and interrupts disabled and they define sysdev classes and sysdevs or sysdev drivers for this purpose. This leads to unnecessarily complicated code and excessive memory usage, so switch them to using struct syscore_ops objects for this purpose instead. Generally, there are three categories of subsystems that use sysdevs for implementing PM operations: (1) subsystems whose suspend/resume callbacks ignore their arguments entirely (the majority), (2) subsystems whose suspend/resume callbacks use their struct sys_device argument, but don't really need to do that, because they can be implemented differently in an arguably simpler way (io_apic.c), and (3) subsystems whose suspend/resume callbacks use their struct sys_device argument, but the value of that argument is always the same and could be ignored (microcode_core.c). In all of these cases the subsystems in question may be readily converted to using struct syscore_ops objects for power management and shutdown. Signed-off-by: Rafael J. Wysocki Reviewed-by: Thomas Gleixner Acked-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 26 +++-------- arch/x86/kernel/apic/apic.c | 33 ++++---------- arch/x86/kernel/apic/io_apic.c | 97 +++++++++++++++++++--------------------- arch/x86/kernel/cpu/mcheck/mce.c | 21 +++++---- arch/x86/kernel/cpu/mtrr/main.c | 10 ++--- arch/x86/kernel/i8237.c | 30 +++---------- arch/x86/kernel/i8259.c | 33 +++++--------- arch/x86/kernel/microcode_core.c | 34 ++++++-------- arch/x86/kernel/pci-gart_64.c | 32 +++---------- arch/x86/oprofile/nmi_int.c | 44 +++++------------- 10 files changed, 128 insertions(+), 232 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6e11c8134158..246d727b65b7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -1260,7 +1260,7 @@ static void disable_iommus(void) * disable suspend until real resume implemented */ -static int amd_iommu_resume(struct sys_device *dev) +static void amd_iommu_resume(void) { struct amd_iommu *iommu; @@ -1276,11 +1276,9 @@ static int amd_iommu_resume(struct sys_device *dev) */ amd_iommu_flush_all_devices(); amd_iommu_flush_all_domains(); - - return 0; } -static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) +static int amd_iommu_suspend(void) { /* disable IOMMUs to go out of the way for BIOS */ disable_iommus(); @@ -1288,17 +1286,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static struct sysdev_class amd_iommu_sysdev_class = { - .name = "amd_iommu", +static struct syscore_ops amd_iommu_syscore_ops = { .suspend = amd_iommu_suspend, .resume = amd_iommu_resume, }; -static struct sys_device device_amd_iommu = { - .id = 0, - .cls = &amd_iommu_sysdev_class, -}; - /* * This is the core init function for AMD IOMMU hardware in the system. * This function is called from the generic x86 DMA layer initialization @@ -1415,14 +1407,6 @@ static int __init amd_iommu_init(void) goto free; } - ret = sysdev_class_register(&amd_iommu_sysdev_class); - if (ret) - goto free; - - ret = sysdev_register(&device_amd_iommu); - if (ret) - goto free; - ret = amd_iommu_init_devices(); if (ret) goto free; @@ -1441,6 +1425,8 @@ static int __init amd_iommu_init(void) amd_iommu_init_notifier(); + register_syscore_ops(&amd_iommu_syscore_ops); + if (iommu_pass_through) goto out; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 966673f44141..fabf01eff771 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -2046,7 +2046,7 @@ static struct { unsigned int apic_thmr; } apic_pm_state; -static int lapic_suspend(struct sys_device *dev, pm_message_t state) +static int lapic_suspend(void) { unsigned long flags; int maxlvt; @@ -2084,23 +2084,21 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int lapic_resume(struct sys_device *dev) +static void lapic_resume(void) { unsigned int l, h; unsigned long flags; - int maxlvt; - int ret = 0; + int maxlvt, ret; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) - return 0; + return; local_irq_save(flags); if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - ret = -ENOMEM; goto restore; } @@ -2162,8 +2160,6 @@ static int lapic_resume(struct sys_device *dev) } restore: local_irq_restore(flags); - - return ret; } /* @@ -2171,17 +2167,11 @@ restore: * are needed on every CPU up until machine_halt/restart/poweroff. */ -static struct sysdev_class lapic_sysclass = { - .name = "lapic", +static struct syscore_ops lapic_syscore_ops = { .resume = lapic_resume, .suspend = lapic_suspend, }; -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; @@ -2189,16 +2179,11 @@ static void __cpuinit apic_pm_activate(void) static int __init init_lapic_sysfs(void) { - int error; - - if (!cpu_has_apic) - return 0; /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + if (cpu_has_apic) + register_syscore_ops(&lapic_syscore_ops); - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; + return 0; } /* local apic needs to resume before other devices access its registers. */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 180ca240e03c..68df09bba92e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -2918,89 +2918,84 @@ static int __init io_apic_bug_finalize(void) late_initcall(io_apic_bug_finalize); -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; +static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS]; -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +static void suspend_ioapic(int ioapic_id) { - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; + struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; int i; - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); + if (!saved_data) + return; + + for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) + saved_data[i] = ioapic_read_entry(ioapic_id, i); +} + +static int ioapic_suspend(void) +{ + int ioapic_id; + + for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++) + suspend_ioapic(ioapic_id); return 0; } -static int ioapic_resume(struct sys_device *dev) +static void resume_ioapic(int ioapic_id) { - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; + struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; unsigned long flags; union IO_APIC_reg_00 reg_00; int i; - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; + if (!saved_data) + return; raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].apicid; - io_apic_write(dev->id, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic_id, 0); + if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) { + reg_00.bits.ID = mp_ioapics[ioapic_id].apicid; + io_apic_write(ioapic_id, 0, reg_00.raw); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); + for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) + ioapic_write_entry(ioapic_id, i, saved_data[i]); +} - return 0; +static void ioapic_resume(void) +{ + int ioapic_id; + + for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) + resume_ioapic(ioapic_id); } -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", +static struct syscore_ops ioapic_syscore_ops = { .suspend = ioapic_suspend, .resume = ioapic_resume, }; -static int __init ioapic_init_sysfs(void) +static int __init ioapic_init_ops(void) { - struct sys_device * dev; - int i, size, error; + int i; - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; + for (i = 0; i < nr_ioapics; i++) { + unsigned int size; - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] + size = nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } + ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL); + if (!ioapic_saved_data[i]) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); } + register_syscore_ops(&ioapic_syscore_ops); + return 0; } -device_initcall(ioapic_init_sysfs); +device_initcall(ioapic_init_ops); /* * Dynamic irq allocate and deallocation diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ab1122998dba..5a05ef63eb4a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1749,14 +1750,14 @@ static int mce_disable_error_reporting(void) return 0; } -static int mce_suspend(struct sys_device *dev, pm_message_t state) +static int mce_suspend(void) { return mce_disable_error_reporting(); } -static int mce_shutdown(struct sys_device *dev) +static void mce_shutdown(void) { - return mce_disable_error_reporting(); + mce_disable_error_reporting(); } /* @@ -1764,14 +1765,18 @@ static int mce_shutdown(struct sys_device *dev) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static int mce_resume(struct sys_device *dev) +static void mce_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); - - return 0; } +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, +}; + static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); @@ -1808,9 +1813,6 @@ static void mce_enable_ce(void *all) } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, .name = "machinecheck", }; @@ -2139,6 +2141,7 @@ static __init int mcheck_init_device(void) return err; } + register_syscore_ops(&mce_syscore_ops); register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index bebabec5b448..307dfbbf4a8e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -630,7 +631,7 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device *sysdev, pm_message_t state) +static int mtrr_save(void) { int i; @@ -642,7 +643,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state) return 0; } -static int mtrr_restore(struct sys_device *sysdev) +static void mtrr_restore(void) { int i; @@ -653,12 +654,11 @@ static int mtrr_restore(struct sys_device *sysdev) mtrr_value[i].ltype); } } - return 0; } -static struct sysdev_driver mtrr_sysdev_driver = { +static struct syscore_ops mtrr_syscore_ops = { .suspend = mtrr_save, .resume = mtrr_restore, }; @@ -839,7 +839,7 @@ static int __init mtrr_init_finialize(void) * TBD: is there any system with such CPU which supports * suspend/resume? If no, we should remove the code. */ - sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + register_syscore_ops(&mtrr_syscore_ops); return 0; } diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index b42ca694dc68..8eeaa81de066 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include @@ -21,7 +21,7 @@ * in asm/dma.h. */ -static int i8237A_resume(struct sys_device *dev) +static void i8237A_resume(void) { unsigned long flags; int i; @@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev) enable_dma(4); release_dma_lock(flags); - - return 0; } -static int i8237A_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; -} - -static struct sysdev_class i8237_sysdev_class = { - .name = "i8237", - .suspend = i8237A_suspend, +static struct syscore_ops i8237_syscore_ops = { .resume = i8237A_resume, }; -static struct sys_device device_i8237A = { - .id = 0, - .cls = &i8237_sysdev_class, -}; - -static int __init i8237A_init_sysfs(void) +static int __init i8237A_init_ops(void) { - int error = sysdev_class_register(&i8237_sysdev_class); - if (!error) - error = sysdev_register(&device_i8237A); - return error; + register_syscore_ops(&i8237_syscore_ops); + return 0; } -device_initcall(i8237A_init_sysfs); +device_initcall(i8237A_init_ops); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index d9ca749c123b..65b8f5c2eebf 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -245,20 +245,19 @@ static void save_ELCR(char *trigger) trigger[1] = inb(0x4d1) & 0xDE; } -static int i8259A_resume(struct sys_device *dev) +static void i8259A_resume(void) { init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); - return 0; } -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +static int i8259A_suspend(void) { save_ELCR(irq_trigger); return 0; } -static int i8259A_shutdown(struct sys_device *dev) +static void i8259A_shutdown(void) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it @@ -266,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev) */ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; } -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", +static struct syscore_ops i8259_syscore_ops = { .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - static void mask_8259A(void) { unsigned long flags; @@ -399,17 +391,12 @@ struct legacy_pic default_legacy_pic = { struct legacy_pic *legacy_pic = &default_legacy_pic; -static int __init i8259A_init_sysfs(void) +static int __init i8259A_init_ops(void) { - int error; - - if (legacy_pic != &default_legacy_pic) - return 0; + if (legacy_pic == &default_legacy_pic) + register_syscore_ops(&i8259_syscore_ops); - error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; + return 0; } -device_initcall(i8259A_init_sysfs); +device_initcall(i8259A_init_ops); diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 87af68e0e1e1..5ed0ab549eb8 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -82,6 +82,7 @@ #include #include #include +#include #include #include @@ -438,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) return 0; } -static int mc_sysdev_resume(struct sys_device *dev) +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, +}; + +/** + * mc_bp_resume - Update boot CPU microcode during resume. + */ +static void mc_bp_resume(void) { - int cpu = dev->id; + int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (!cpu_online(cpu)) - return 0; - - /* - * All non-bootup cpus are still disabled, - * so only CPU 0 will apply ucode here. - * - * Moreover, there can be no concurrent - * updates from any other places at this point. - */ - WARN_ON(cpu != 0); - if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); - - return 0; } -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, +static struct syscore_ops mc_syscore_ops = { + .resume = mc_bp_resume, }; static __cpuinit int @@ -542,6 +535,7 @@ static int __init microcode_init(void) if (error) return error; + register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); pr_info("Microcode Update Driver: v" MICROCODE_VERSION diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c01ffa5b9b87..82ada01625b9 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include @@ -589,7 +589,7 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static void gart_fixup_northbridges(struct sys_device *dev) +static void gart_fixup_northbridges(void) { int i; @@ -613,33 +613,20 @@ static void gart_fixup_northbridges(struct sys_device *dev) } } -static int gart_resume(struct sys_device *dev) +static void gart_resume(void) { pr_info("PCI-DMA: Resuming GART IOMMU\n"); - gart_fixup_northbridges(dev); + gart_fixup_northbridges(); enable_gart_translations(); - - return 0; } -static int gart_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; -} - -static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, +static struct syscore_ops gart_syscore_ops = { .resume = gart_resume, }; -static struct sys_device device_gart = { - .cls = &gart_sysdev_class, -}; - /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -650,7 +637,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i, error; + int i; pr_info("PCI-DMA: Disabling AGP.\n"); @@ -685,12 +672,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - error = sysdev_class_register(&gart_sysdev_class); - if (!error) - error = sysdev_register(&device_gart); - if (error) - panic("Could not register gart_sysdev -- " - "would corrupt data on next suspend"); + register_syscore_ops(&gart_syscore_ops); flush_gart(); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index e2b7b0c06cdf..8dace181c88e 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -536,7 +536,7 @@ static void nmi_shutdown(void) #ifdef CONFIG_PM -static int nmi_suspend(struct sys_device *dev, pm_message_t state) +static int nmi_suspend(void) { /* Only one CPU left, just stop that one */ if (nmi_enabled == 1) @@ -544,49 +544,31 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int nmi_resume(struct sys_device *dev) +static void nmi_resume(void) { if (nmi_enabled == 1) nmi_cpu_start(NULL); - return 0; } -static struct sysdev_class oprofile_sysclass = { - .name = "oprofile", +static struct syscore_ops oprofile_syscore_ops = { .resume = nmi_resume, .suspend = nmi_suspend, }; -static struct sys_device device_oprofile = { - .id = 0, - .cls = &oprofile_sysclass, -}; - -static int __init init_sysfs(void) +static void __init init_suspend_resume(void) { - int error; - - error = sysdev_class_register(&oprofile_sysclass); - if (error) - return error; - - error = sysdev_register(&device_oprofile); - if (error) - sysdev_class_unregister(&oprofile_sysclass); - - return error; + register_syscore_ops(&oprofile_syscore_ops); } -static void exit_sysfs(void) +static void exit_suspend_resume(void) { - sysdev_unregister(&device_oprofile); - sysdev_class_unregister(&oprofile_sysclass); + unregister_syscore_ops(&oprofile_syscore_ops); } #else -static inline int init_sysfs(void) { return 0; } -static inline void exit_sysfs(void) { } +static inline void init_suspend_resume(void) { } +static inline void exit_suspend_resume(void) { } #endif /* CONFIG_PM */ @@ -789,9 +771,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) mux_init(ops); - ret = init_sysfs(); - if (ret) - return ret; + init_suspend_resume(); printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; @@ -799,5 +779,5 @@ int __init op_nmi_init(struct oprofile_operations *ops) void op_nmi_exit(void) { - exit_sysfs(); + exit_suspend_resume(); } -- cgit v1.2.2 From d47d81c0e9abdc3c88653fabff5beae82c949b09 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Mar 2011 22:16:41 +0100 Subject: Introduce ARCH_NO_SYSDEV_OPS config option (v2) Introduce Kconfig option allowing architectures where sysdev operations used during system suspend, resume and shutdown have been completely replaced with struct sycore_ops operations to avoid building sysdev code that will never be used. Make callbacks in struct sys_device and struct sysdev_driver depend on ARCH_NO_SYSDEV_OPS to allows us to verify if all of the references have been actually removed from the code the given architecture depends on. Make x86 select ARCH_NO_SYSDEV_OPS. Signed-off-by: Rafael J. Wysocki --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d57ddd7573cc..b1cd5a96a511 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -71,6 +71,7 @@ config X86 select GENERIC_IRQ_SHOW select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP + select ARCH_NO_SYSDEV_OPS config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) -- cgit v1.2.2 From 861b5ae7cde96ca081914e21dedfa7e8a38da622 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Mar 2011 16:42:02 -0700 Subject: bitops: introduce little-endian bitops for most architectures Introduce little-endian bit operations to the big-endian architectures which do not have native little-endian bit operations and the little-endian architectures. (alpha, avr32, blackfin, cris, frv, h8300, ia64, m32r, mips, mn10300, parisc, sh, sparc, tile, x86, xtensa) These architectures can just include generic implementation (asm-generic/bitops/le.h). Signed-off-by: Akinobu Mita Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: "Luck, Tony" Cc: Ralf Baechle Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Hirokazu Takata Cc: "David S. Miller" Cc: Chris Zankel Cc: Ingo Molnar Cc: Thomas Gleixner Acked-by: Hans-Christian Egtvedt Acked-by: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/bitops.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 903683b07e42..c68bc101441d 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -456,6 +456,7 @@ static inline int fls(int x) #ifdef __KERNEL__ +#include #include #define ext2_set_bit_atomic(lock, nr, addr) \ -- cgit v1.2.2 From f312eff8164879e04923d41e9dd23e7850937d85 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Mar 2011 16:42:14 -0700 Subject: bitops: remove ext2 non-atomic bitops from asm/bitops.h As the result of conversions, there are no users of ext2 non-atomic bit operations except for ext2 filesystem itself. Now we can put them into architecture independent code in ext2 filesystem, and remove from asm/bitops.h for all architectures. Signed-off-by: Akinobu Mita Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/bitops.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index c68bc101441d..3c95e072c179 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -457,7 +457,6 @@ static inline int fls(int x) #ifdef __KERNEL__ #include -#include #define ext2_set_bit_atomic(lock, nr, addr) \ test_and_set_bit((nr), (unsigned long *)(addr)) -- cgit v1.2.2 From 61f2e7b0f474225b4226772830ae4b29a3a21f8d Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Mar 2011 16:42:16 -0700 Subject: bitops: remove minix bitops from asm/bitops.h minix bit operations are only used by minix filesystem and useless by other modules. Because byte order of inode and block bitmaps is different on each architecture like below: m68k: big-endian 16bit indexed bitmaps h8300, microblaze, s390, sparc, m68knommu: big-endian 32 or 64bit indexed bitmaps m32r, mips, sh, xtensa: big-endian 32 or 64bit indexed bitmaps for big-endian mode little-endian bitmaps for little-endian mode Others: little-endian bitmaps In order to move minix bit operations from asm/bitops.h to architecture independent code in minix filesystem, this provides two config options. CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED is only selected by m68k. CONFIG_MINIX_FS_NATIVE_ENDIAN is selected by the architectures which use native byte order bitmaps (h8300, microblaze, s390, sparc, m68knommu, m32r, mips, sh, xtensa). The architectures which always use little-endian bitmaps do not select these options. Finally, we can remove minix bit operations from asm/bitops.h for all architectures. Signed-off-by: Akinobu Mita Acked-by: Arnd Bergmann Acked-by: Greg Ungerer Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: Andreas Schwab Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Michal Simek Cc: "David S. Miller" Cc: Hirokazu Takata Acked-by: Ralf Baechle Acked-by: Paul Mundt Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/bitops.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 3c95e072c179..69d58131bc8e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -463,7 +463,5 @@ static inline int fls(int x) #define ext2_clear_bit_atomic(lock, nr, addr) \ test_and_clear_bit((nr), (unsigned long *)(addr)) -#include - #endif /* __KERNEL__ */ #endif /* _ASM_X86_BITOPS_H */ -- cgit v1.2.2 From 388b78adc9899f0299128610f566051d0b1a57f6 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 23 Mar 2011 16:43:03 -0700 Subject: rapidio: modify configuration to support PCI-SRIO controller 1. Add an option to include RapidIO support if the PCI is available. 2. Add FSL_RIO configuration option to enable controller selection. 3. Add RapidIO support option into x86 and MIPS architectures. Signed-off-by: Alexandre Bounine Acked-by: Kumar Gala Cc: Matt Porter Cc: Li Yang Cc: Thomas Moll Cc: Micha Nelissen Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d57ddd7573cc..140e254fe546 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2096,6 +2096,16 @@ source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" +config RAPIDIO + bool "RapidIO support" + depends on PCI + default n + help + If you say Y here, the kernel will include drivers and + infrastructure code to support RapidIO interconnect devices. + +source "drivers/rapidio/Kconfig" + endmenu -- cgit v1.2.2 From 8547727756a7322b99aa313ce50fe15d8f858872 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 23 Mar 2011 16:43:28 -0700 Subject: remove dma64_addr_t There is no user now. Signed-off-by: FUJITA Tomonori Cc: David Miller Cc: Ralf Baechle Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/types.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index 88102055a4b8..8e8c23fef08c 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -3,12 +3,4 @@ #include -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -typedef u64 dma64_addr_t; - -#endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_TYPES_H */ -- cgit v1.2.2 From 93a72052be81823fa1584b9be037d51924f9efa4 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 23 Mar 2011 16:43:29 -0700 Subject: crash_dump: export is_kdump_kernel to modules, consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn The Xen PV drivers in a crashed HVM guest can not connect to the dom0 backend drivers because both frontend and backend drivers are still in connected state. To run the connection reset function only in case of a crashdump, the is_kdump_kernel() function needs to be available for the PV driver modules. Consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn into kernel/crash_dump.c Also export elfcorehdr_addr to make is_kdump_kernel() usable for modules. Leave 'elfcorehdr' as early_param(). This changes powerpc from __setup() to early_param(). It adds an address range check from x86 also on ia64 and powerpc. [akpm@linux-foundation.org: additional #includes] [akpm@linux-foundation.org: remove elfcorehdr_addr export] [akpm@linux-foundation.org: fix for Tejun's mm/nobootmem.c changes] Signed-off-by: Olaf Hering Cc: Russell King Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/crash_dump_32.c | 3 --- arch/x86/kernel/crash_dump_64.c | 3 --- arch/x86/kernel/e820.c | 1 + arch/x86/kernel/setup.c | 22 ---------------------- 4 files changed, 1 insertion(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index d5cd13945d5a..642f75a68cd5 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -14,9 +14,6 @@ static void *kdump_buf_page; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 994828899e09..afa64adb75ee 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,9 +10,6 @@ #include #include -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cdf5bfd9d4d5..3e2ef8425316 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 32bd87cbf982..5a0484a95ad6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -619,28 +619,6 @@ void __init reserve_standard_io_resources(void) } -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ - -#ifdef CONFIG_CRASH_DUMP -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - static __init void reserve_ibft_region(void) { unsigned long addr, size = 0; -- cgit v1.2.2 From 71f9e59800e5ad4e6b683348424c9fe54306cd43 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 24 Mar 2011 11:42:30 +0900 Subject: x86, dumpstack: Use %pB format specifier for stack trace Improve noreturn function entries in call traces: Before: Call Trace: [] panic+0x8c/0x18d [] deep01+0x0/0x38 [test_panic] <--- bad [] proc_file_write+0x73/0x8d [] proc_reg_write+0x8d/0xac [] vfs_write+0xa1/0xc5 [] sys_write+0x45/0x6c [] system_call_fastpath+0x16/0x1b After: Call Trace: [] panic+0x8c/0x18d [] panic_write+0x20/0x20 [test_panic] <--- good [] proc_file_write+0x73/0x8d [] proc_reg_write+0x8d/0xac [] vfs_write+0xa1/0xc5 [] sys_write+0x45/0x6c [] system_call_fastpath+0x16/0x1b Signed-off-by: Namhyung Kim Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <1300934550-21394-2-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 999e2793590b..24d0479025f9 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,7 +27,7 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%p>] %s%pS\n", (void *) address, + printk(" [<%p>] %s%pB\n", (void *) address, reliable ? "" : "? ", (void *) address); } -- cgit v1.2.2 From b254244d2682fe975630f176c25a4444cc4e088d Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Thu, 24 Mar 2011 08:10:07 -0400 Subject: xen/p2m: Allocate p2m tracking pages on override It is possible to add a p2m override on pages that are currently mapped to INVALID_P2M_ENTRY; in particular, this will happen when using ballooned pages in gntdev. This means that set_phys_to_machine must be used instead of __set_phys_to_machine. Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 215a3ce61068..2a44edd606e5 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -671,7 +671,9 @@ int m2p_add_override(unsigned long mfn, struct page *page) page->private = mfn; page->index = pfn_to_mfn(pfn); - __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) + return -ENOMEM; + if (!PageHighMem(page)) /* Just zap old mapping for now */ pte_clear(&init_mm, address, ptep); @@ -709,7 +711,7 @@ int m2p_remove_override(struct page *page) spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); - __set_phys_to_machine(pfn, page->index); + set_phys_to_machine(pfn, page->index); if (!PageHighMem(page)) set_pte_at(&init_mm, address, ptep, -- cgit v1.2.2 From 914a76ca5eedc9f286a36f61c4eaa95b451ba3e6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Mar 2011 15:44:33 -0400 Subject: oprofile, x86: Allow setting EDGE/INV/CMASK for counter events For some performance events it's useful to set the EDGE and INV bits and the CMASK mask in the counter control register. The list of predefined events Intel releases for each CPU has some events which require these settings to get more "natural" to use higher level events. oprofile currently doesn't allow this. This patch adds new extra configuration fields for them, so that they can be specified in oprofilefs. An updated oprofile daemon can then make use of this to set them. v2: Write back masked extra value to variable. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 5 +++++ arch/x86/oprofile/op_counter.h | 1 + 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index e2b7b0c06cdf..ee165f1a1f37 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -49,6 +49,10 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; val |= (counter_config->unit_mask & 0xFF) << 8; + counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_EDGE | + ARCH_PERFMON_EVENTSEL_CMASK); + val |= counter_config->extra; event &= model->event_mask ? model->event_mask : 0xFF; val |= event & 0xFF; val |= (event & 0x0F00) << 24; @@ -440,6 +444,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); } return 0; diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index e28398df0df2..0b7b7b179cbe 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -22,6 +22,7 @@ struct op_counter_config { unsigned long kernel; unsigned long user; unsigned long unit_mask; + unsigned long extra; }; extern struct op_counter_config counter_config[]; -- cgit v1.2.2 From 242214f9c1eeaae40eca11e3b4d37bfce960a7cd Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 24 Mar 2011 23:36:25 +0300 Subject: perf, x86: P4 PMU - Read proper MSR register to catch unflagged overflows The read of a proper MSR register was missed and instead of counter the configration register was tested (it has ARCH_P4_UNFLAGGED_BIT always cleared) leading to unknown NMI hitting the system. As result the user may obtain "Dazed and confused, but trying to continue" message. Fix it by reading a proper MSR register. When an NMI happens on a P4, the perf nmi handler checks the configuration register to see if the overflow bit is set or not before taking appropriate action. Unfortunately, various P4 machines had a broken overflow bit, so a backup mechanism was implemented. This mechanism checked to see if the counter rolled over or not. A previous commit that implemented this backup mechanism was broken. Instead of reading the counter register, it used the configuration register to determine if the counter rolled over or not. Reading that bit would give incorrect results. This would lead to 'Dazed and confused' messages for the end user when using the perf tool (or if the nmi watchdog is running). The fix is to read the counter register before determining if the counter rolled over or not. Signed-off-by: Don Zickus Signed-off-by: Cyrill Gorcunov Cc: Lin Ming LKML-Reference: <4D8BAB49.3080701@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 3769ac822f96..d3d7b59841e5 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -777,6 +777,7 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) * the counter has reached zero value and continued counting before * real NMI signal was received: */ + rdmsrl(hwc->event_base, v); if (!(v & ARCH_P4_UNFLAGGED_BIT)) return 1; -- cgit v1.2.2 From 00a30b254b88d2d4f5af00835a9b7f70326def9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2011 22:53:10 +0100 Subject: x86: DT: Fix return condition in irq_create_of_mapping() The xlate() function returns 0 or a negative error code. Returning the error code blindly will be seen as an huge irq number by the calling function because irq_create_of_mapping() returns an unsigned value. Return 0 (NO_IRQ) as required. Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior --- arch/x86/kernel/devicetree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 7a8cebc9ff29..9c91badb6ca9 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -65,7 +65,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return 0; ret = ih->xlate(ih, intspec, intsize, &virq, &type); if (ret) - return ret; + return 0; if (type == IRQ_TYPE_NONE) return virq; /* set the mask if it is different from current */ -- cgit v1.2.2 From 07611dbda5ccbd9a628f29686d62bafdd007db7b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2011 21:41:57 +0100 Subject: x86: DT: Cleanup namespace and call irq_set_irq_type() unconditional That call escaped the name space cleanup. Fix it up. We really want to call there. The chip might have changed since the irq was setup initially. So let the core code and the chip decide what to do. The status is just an unreliable snapshot. Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior --- arch/x86/kernel/devicetree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 9c91badb6ca9..706a9fb46a58 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -68,9 +68,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return 0; if (type == IRQ_TYPE_NONE) return virq; - /* set the mask if it is different from current */ - if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK)) - set_irq_type(virq, type); + irq_set_irq_type(virq, type); return virq; } EXPORT_SYMBOL_GPL(irq_create_of_mapping); -- cgit v1.2.2 From 45daae575e08bbf7405c5a3633e956fa364d1b4f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Mar 2011 10:24:23 +0100 Subject: perf, x86: Complain louder about BIOSen corrupting CPU/PMU state and continue Eric Dumazet reported that hardware PMU events do not work on his system, due to the BIOS corrupting PMU state: Performance Events: PEBS fmt0+, Core2 events, Broken BIOS detected, using software events only. [Firmware Bug]: the BIOS has corrupted hw-PMU resources (MSR 186 is 43003c) Linus suggested that we continue in the face of such BIOS-induced CPU state corruption: http://lkml.org/lkml/2011/3/24/608 Such BIOSes will have to be fixed - Linux developers rely on a working and fully capable PMU and the BIOS interfering with the CPU's PMU state is simply not acceptable. So this patch changes perf to continue when it detects such BIOS interaction, some hardware events may be unreliable due to the BIOS writing and re-writing them - there's not much the kernel can do about that but to detect the corruption and report it. Reported-and-tested-by: Eric Dumazet Suggested-by: Linus Torvalds Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ec46eea0c4ed..eb00677ee2ae 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -500,12 +500,17 @@ static bool check_hw_exists(void) return true; bios_fail: - printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); + /* + * We still allow the PMU driver to operate: + */ + printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); - return false; + + return true; msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + return false; } -- cgit v1.2.2 From 21431c2900a0b669080b5bfaae2a7d9d9c026e9b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 15 Mar 2010 07:28:00 -0500 Subject: kgdb,x86_64: fix compile warning found with sparse Fix sparse warning: arch/x86/kernel/kgdb.c:123:9: warning: switch with no cases Reported-by: Namhyung Kim Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index a4130005028a..3c2fb0f25abd 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -121,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, dbg_reg_def[regno].size); - switch (regno) { #ifdef CONFIG_X86_32 + switch (regno) { case GDB_SS: if (!user_mode_vm(regs)) *(unsigned long *)mem = __KERNEL_DS; @@ -135,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) case GDB_FS: *(unsigned long *)mem = 0xFFFF; break; -#endif } +#endif return dbg_reg_def[regno].name; } -- cgit v1.2.2 From fa1df691688f34cbcd5bf77bd084bbe47e9d6bfe Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 21 Mar 2011 19:19:35 -0700 Subject: mfd: Add mfd_clone_cell(), convert cs5535-mfd/olpc-xo1 to it Replace mfd_shared_platform_driver_register with mfd_clone_cell. The former was called by an mfd client, and registered both a platform driver and device. The latter is called by an mfd driver, and registers only a platform device. The downside of this is that mfd drivers need to be modified whenever new clients are added that share a cell; the upside is that it fits Linux's driver model better. It's also simpler. This also converts cs5535-mfd/olpc-xo1 from the old API. cs5535-mfd now creates the olpc-xo1-{acpi,pms} devices, while olpc-xo1 binds to them via platform drivers. Signed-off-by: Andres Salomon Signed-off-by: Samuel Ortiz --- arch/x86/platform/olpc/olpc-xo1.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index 99513642a0e6..386e3a159cca 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -121,22 +121,21 @@ static int __init olpc_xo1_init(void) { int r; - r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms"); + r = platform_driver_register(&cs5535_pms_drv); if (r) return r; - r = mfd_shared_platform_driver_register(&cs5535_acpi_drv, - "cs5535-acpi"); + r = platform_driver_register(&cs5535_acpi_drv); if (r) - mfd_shared_platform_driver_unregister(&cs5535_pms_drv); + platform_driver_unregister(&cs5535_pms_drv); return r; } static void __exit olpc_xo1_exit(void) { - mfd_shared_platform_driver_unregister(&cs5535_acpi_drv); - mfd_shared_platform_driver_unregister(&cs5535_pms_drv); + platform_driver_unregister(&cs5535_acpi_drv); + platform_driver_unregister(&cs5535_pms_drv); } MODULE_AUTHOR("Daniel Drake "); -- cgit v1.2.2 From adfa4bd4a8bfc53ca7370c57be240d35c2ec28e2 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Tue, 22 Mar 2011 13:50:39 -0700 Subject: mfd: OLPC: Clean up names to match what OLPC actually uses The cs5535-pms cell doesn't actually need to be cloned, so we can drop that and simply have the olpc-xo1.c driver use "cs5535-pms" directly. Also, rename the cs5535-acpi clones to what we actually use for the (currently out-of-tree) SCI driver. In the process, that fixes a subtle bug in olpc-xo1.c which broke powerdown on XO-1s.. olpc-xo1-ac-acpi was a typo, not something that actually existed. Signed-off-by: Daniel Drake Signed-off-by: Andres Salomon Signed-off-by: Samuel Ortiz --- arch/x86/platform/olpc/olpc-xo1.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index 386e3a159cca..ab81fb271760 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -72,9 +72,9 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev) dev_err(&pdev->dev, "can't fetch device resource info\n"); return -EIO; } - if (strcmp(pdev->name, "olpc-xo1-pms") == 0) + if (strcmp(pdev->name, "cs5535-pms") == 0) pms_base = res->start; - else if (strcmp(pdev->name, "olpc-xo1-ac-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0) acpi_base = res->start; /* If we have both addresses, we can override the poweroff hook */ @@ -90,9 +90,9 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev) { mfd_cell_disable(pdev); - if (strcmp(pdev->name, "olpc-xo1-pms") == 0) + if (strcmp(pdev->name, "cs5535-pms") == 0) pms_base = 0; - else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0) acpi_base = 0; pm_power_off = NULL; @@ -101,7 +101,7 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev) static struct platform_driver cs5535_pms_drv = { .driver = { - .name = "olpc-xo1-pms", + .name = "cs5535-pms", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, @@ -110,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = { static struct platform_driver cs5535_acpi_drv = { .driver = { - .name = "olpc-xo1-acpi", + .name = "olpc-xo1-pm-acpi", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, -- cgit v1.2.2 From 60af520cf264ea26b2af3a6871bbd71850522aea Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Sun, 13 Mar 2011 16:56:17 +0800 Subject: crypto: aesni-intel - fixed problem with packets that are not multiple of 64bytes This patch fixes problem with packets that are not multiple of 64bytes. Signed-off-by: Adrian Hoban Signed-off-by: Aidan O'Mahony Signed-off-by: Gabriele Paoloni Signed-off-by: Tadeusz Struk Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 5 ++++- arch/x86/crypto/aesni-intel_glue.c | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index adcf794b22e2..be6d9e365a80 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -1612,6 +1612,7 @@ _zero_cipher_left_encrypt: movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) sub $16, %r11 add %r13, %r11 @@ -1634,7 +1635,9 @@ _zero_cipher_left_encrypt: # GHASH computation for the last <16 byte block sub %r13, %r11 add $16, %r11 - PSHUFB_XMM %xmm10, %xmm1 + + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm0 # shuffle xmm0 back to output as ciphertext diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index e0e6340c8dad..2577613fb32b 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -828,9 +828,15 @@ static int rfc4106_init(struct crypto_tfm *tfm) struct cryptd_aead *cryptd_tfm; struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); + struct crypto_aead *cryptd_child; + struct aesni_rfc4106_gcm_ctx *child_ctx; cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); + + cryptd_child = cryptd_aead_child(cryptd_tfm); + child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child); + memcpy(child_ctx, ctx, sizeof(*ctx)); ctx->cryptd_tfm = cryptd_tfm; tfm->crt_aead.reqsize = sizeof(struct aead_request) + crypto_aead_reqsize(&cryptd_tfm->base); @@ -923,6 +929,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, int ret = 0; struct crypto_tfm *tfm = crypto_aead_tfm(parent); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + struct aesni_rfc4106_gcm_ctx *child_ctx = + aesni_rfc4106_gcm_ctx_get(cryptd_child); u8 *new_key_mem = NULL; if (key_len < 4) { @@ -966,6 +975,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, goto exit; } ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); + memcpy(child_ctx, ctx, sizeof(*ctx)); exit: kfree(new_key_mem); return ret; @@ -997,7 +1007,6 @@ static int rfc4106_encrypt(struct aead_request *req) int ret; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); if (!irq_fpu_usable()) { struct aead_request *cryptd_req = @@ -1006,6 +1015,7 @@ static int rfc4106_encrypt(struct aead_request *req) aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); return crypto_aead_encrypt(cryptd_req); } else { + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); kernel_fpu_begin(); ret = cryptd_child->base.crt_aead.encrypt(req); kernel_fpu_end(); @@ -1018,7 +1028,6 @@ static int rfc4106_decrypt(struct aead_request *req) int ret; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); if (!irq_fpu_usable()) { struct aead_request *cryptd_req = @@ -1027,6 +1036,7 @@ static int rfc4106_decrypt(struct aead_request *req) aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); return crypto_aead_decrypt(cryptd_req); } else { + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); kernel_fpu_begin(); ret = cryptd_child->base.crt_aead.decrypt(req); kernel_fpu_end(); -- cgit v1.2.2 From d7c3f8cee81f4548de0513403b74131aee655576 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 26 Mar 2011 20:57:18 -0500 Subject: percpu: Omit segment prefix in the UP case for cmpxchg_double Omit the segment prefix in the UP case. GS is not used then and we will generate segfaults if cmpxchg16b is used otherwise. Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- arch/x86/include/asm/percpu.h | 10 ++++++---- arch/x86/lib/cmpxchg16b_emu.S | 14 ++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index a09e1f052d84..d475b4398d8b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -45,7 +45,7 @@ #include #ifdef CONFIG_SMP -#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x +#define __percpu_prefix "%%"__stringify(__percpu_seg)":" #define __my_cpu_offset percpu_read(this_cpu_off) /* @@ -62,9 +62,11 @@ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ }) #else -#define __percpu_arg(x) "%P" #x +#define __percpu_prefix "" #endif +#define __percpu_arg(x) __percpu_prefix "%P" #x + /* * Initialized pointers to per-cpu variables needed for the boot * processor need to use these macros to get the proper address @@ -516,11 +518,11 @@ do { \ typeof(o2) __n2 = n2; \ typeof(o2) __dummy; \ alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ - "cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t", \ + "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ X86_FEATURE_CX16, \ ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ "S" (&pcp1), "b"(__n1), "c"(__n2), \ - "a"(__o1), "d"(__o2)); \ + "a"(__o1), "d"(__o2) : "memory"); \ __ret; \ }) diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 3e8b08a6de2b..1e572c507d06 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -10,6 +10,12 @@ #include #include +#ifdef CONFIG_SMP +#define SEG_PREFIX %gs: +#else +#define SEG_PREFIX +#endif + .text /* @@ -37,13 +43,13 @@ this_cpu_cmpxchg16b_emu: pushf cli - cmpq %gs:(%rsi), %rax + cmpq SEG_PREFIX(%rsi), %rax jne not_same - cmpq %gs:8(%rsi), %rdx + cmpq SEG_PREFIX 8(%rsi), %rdx jne not_same - movq %rbx, %gs:(%rsi) - movq %rcx, %gs:8(%rsi) + movq %rbx, SEG_PREFIX(%rsi) + movq %rcx, SEG_PREFIX 8(%rsi) popf mov $1, %al -- cgit v1.2.2 From ca444564a947034557a85357b3911d067cac4b8f Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 25 Mar 2011 15:20:14 +0100 Subject: x86: Stop including in two asm header files Stop including in x86 header files which don't need it. This will let the compiler complain when this header is not included by source files when it should, so that contributors can fix the problem before building on other architectures starts to fail. Credits go to Geert for the idea. Signed-off-by: Jean Delvare Cc: James E.J. Bottomley Cc: Geert Uytterhoeven Cc: Stephen Rothwell LKML-Reference: <20110325152014.297890ec@endymion.delvare> [ this also fixes an upstream build bug in drivers/media/rc/ite-cir.c ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 1 - arch/x86/include/asm/dma.h | 1 - arch/x86/kernel/apic/hw_nmi.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + arch/x86/kernel/irq.c | 1 + arch/x86/kernel/reboot.c | 1 + arch/x86/platform/uv/tlb_uv.c | 1 + 7 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a279d98ea95e..2b7d573be549 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -2,7 +2,6 @@ #define _ASM_X86_APIC_H #include -#include #include #include diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 97b6d8114a43..057099e5faba 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -10,7 +10,6 @@ #include /* And spinlocks */ #include /* need byte IO */ -#include #ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER #define dma_outb outb_p diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c4e557a1ebb6..5260fe91bcb6 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(void) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 3c289281394c..d2cf39bc5ecf 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 948a31eae75f..1cb0b9fc78dc 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d3ce37edb54d..08c44b08bf5b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index a7b38d35c29a..7cb6424317f6 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include -- cgit v1.2.2 From 4ac5fc6a3e4d90120f292526bcaa5ee182a7411b Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 29 Mar 2011 16:34:32 +0800 Subject: x86, microcode: Unregister syscore_ops after microcode unloaded Currently, microcode doesn't unregister syscore_ops after it's unloaded. So if we modprobe then rmmod microcode, the stale microcode syscore_ops info will stay on syscore_ops_list. Later when we're trying to reboot/halt/shutdown the machine, kernel will panic on syscore_shutdown(). With the patch applied, I can reboot/halt/shutdown my machine successfully. Signed-off-by: Xiaotian Feng Cc: Tigran Aivazian Cc: Rafael J. Wysocki LKML-Reference: <1301387672-23661-1-git-send-email-dfeng@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 5ed0ab549eb8..f9242800bc84 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -550,6 +550,7 @@ static void __exit microcode_exit(void) microcode_dev_exit(); unregister_hotcpu_notifier(&mc_cpu_notifier); + unregister_syscore_ops(&mc_syscore_ops); get_online_cpus(); mutex_lock(µcode_mutex); -- cgit v1.2.2 From b83c6e55ac482f08984504d61382ecf05f0afe32 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 24 Mar 2011 13:34:32 -0700 Subject: xen: fix p2m section mismatches Fix section mismatch warnings: set_phys_range_identity() is called by __init xen_set_identity(), so also mark set_phys_range_identity() as __init. then: __early_alloc_p2m() is called set_phys_range_identity(), so also mark __early_alloc_p2m() as __init. WARNING: arch/x86/built-in.o(.text+0x7856): Section mismatch in reference from the function __early_alloc_p2m() to the function .init.text:extend_brk() The function __early_alloc_p2m() references the function __init extend_brk(). This is often because __early_alloc_p2m lacks a __init annotation or the annotation of extend_brk is wrong. WARNING: arch/x86/built-in.o(.text+0x7967): Section mismatch in reference from the function set_phys_range_identity() to the function .init.text:extend_brk() The function set_phys_range_identity() references the function __init extend_brk(). This is often because set_phys_range_identity lacks a __init annotation or the annotation of extend_brk is wrong. [v2: Per Stephen Hemming recommonedation made __early_alloc_p2m static] Signed-off-by: Randy Dunlap Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2a44edd606e5..141eb0de8b06 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -497,7 +497,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -bool __early_alloc_p2m(unsigned long pfn) +static bool __init __early_alloc_p2m(unsigned long pfn) { unsigned topidx, mididx, idx; @@ -530,7 +530,7 @@ bool __early_alloc_p2m(unsigned long pfn) } return idx != 0; } -unsigned long set_phys_range_identity(unsigned long pfn_s, +unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { unsigned long pfn; -- cgit v1.2.2 From 86cc8dfc211695193a060a240ac9c9287606e5d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Mar 2011 00:09:01 +0200 Subject: x86: apb_timer: Fixup genirq fallout The lonely user of the internal interface was not in the coccinelle script. Reported-by: Randy Dunlap Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apb_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 1293c709ee85..cd1ffed4ee22 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -316,7 +316,7 @@ static void apbt_setup_irq(struct apbt_dev *adev) irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); /* APB timer irqs are set up as mp_irqs, timer is edge type */ - __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); + __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); if (system_state == SYSTEM_BOOTING) { if (request_irq(adev->irq, apbt_interrupt_handler, -- cgit v1.2.2 From 84ac7cdbdd0f04df6b96153f7a79127fd6e45467 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Mar 2011 15:38:12 -0700 Subject: x86, mtrr, pat: Fix one cpu getting out of sync during resume On laptops with core i5/i7, there were reports that after resume graphics workloads were performing poorly on a specific AP, while the other cpu's were ok. This was observed on a 32bit kernel specifically. Debug showed that the PAT init was not happening on that AP during resume and hence it contributing to the poor workload performance on that cpu. On this system, resume flow looked like this: 1. BP starts the resume sequence and we reinit BP's MTRR's/PAT early on using mtrr_bp_restore() 2. Resume sequence brings all AP's online 3. Resume sequence now kicks off the MTRR reinit on all the AP's. 4. For some reason, between point 2 and 3, we moved from BP to one of the AP's. My guess is that printk() during resume sequence is contributing to this. We don't see similar behavior with the 64bit kernel but there is no guarantee that at this point the remaining resume sequence (after AP's bringup) has to happen on BP. 5. set_mtrr() was assuming that we are still on BP and skipped the MTRR/PAT init on that cpu (because of 1 above) 6. But we were on an AP and this led to not reprogramming PAT on this cpu leading to bad performance. Fix this by doing unconditional mtrr_if->set_all() in set_mtrr() during MTRR/PAT init. This might be unnecessary if we are still running on BP. But it is of no harm and will guarantee that after resume, all the cpu's will be in sync with respect to the MTRR/PAT registers. Signed-off-by: Suresh Siddha LKML-Reference: <1301438292-28370-1-git-send-email-eric@anholt.net> Signed-off-by: Eric Anholt Tested-by: Keith Packard Cc: stable@kernel.org [v2.6.32+] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 307dfbbf4a8e..929739a653d1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -293,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ /* * HACK! - * We use this same function to initialize the mtrrs on boot. - * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. - * If we're doing that @reg is set to something special... + * + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * this cpu we still do mtrr_if->set_all(). During boot/resume, this + * is unnecessary if at this point we are still on the cpu that started + * the boot/resume sequence. But there is no guarantee that we are still + * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be + * sure that we are in sync with everyone else. */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); - else if (!mtrr_aps_delayed_init) + else mtrr_if->set_all(); /* Wait for the others */ -- cgit v1.2.2 From cb6c8520f6f6bba7b7e1a6de3360a8edfd8243b6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 30 Mar 2011 20:34:47 +0200 Subject: x86, amd-nb: Rename CPU PCI id define for F4 With increasing number of PCI function ids, add the PCI function id in the define name instead of its symbolic name in the BKDG for more clarity. This renames function 4 define. Signed-off-by: Borislav Petkov Cc: Jesse Barnes LKML-Reference: <20110330183447.GA3668@aftab> Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_nb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 6801959a8b2a..4c39baa8facc 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,7 +21,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { EXPORT_SYMBOL(amd_nb_misc_ids); static struct pci_device_id amd_nb_link_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_LINK) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, {} }; -- cgit v1.2.2 From 818987e9a19c52240ba9b1c20f28f047eef76072 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 31 Mar 2011 09:32:02 -0500 Subject: x86, UV: Fix kdump reboot After a crash dump on an SGI Altix UV system the crash kernel fails to cause a reboot. EFI mode is disabled in the kdump kernel, so only the reboot_type of BOOT_ACPI works. Signed-off-by: Cliff Wickman Cc: rja@sgi.com LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d2cf39bc5ecf..33b10a0fc095 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -811,4 +813,11 @@ void __init uv_system_init(void) /* register Legacy VGA I/O redirection handler */ pci_register_set_vga_state(uv_set_vga_state); + + /* + * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as + * EFI is not enabled in the kdump kernel. + */ + if (is_kdump_kernel()) + reboot_type = BOOT_ACPI; } -- cgit v1.2.2 From a4dd99250dc49031e6a92a895dbcc230a4832083 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Apr 2011 07:15:14 -0700 Subject: rcu: create new rcu_access_index() and use in mce The MCE subsystem needs to sample an RCU-protected index outside of any protection for that index. If this was a pointer, we would use rcu_access_pointer(), but there is no corresponding rcu_access_index(). This commit therefore creates an rcu_access_index() and applies it to MCE. Signed-off-by: Paul E. McKenney Tested-by: Zdenek Kabelac --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a05ef63eb4a..3385ea26f684 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1626,7 +1626,7 @@ out: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference_check_mce(mcelog.next)) + if (rcu_access_index(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; -- cgit v1.2.2 From 43a6246f9c41e4d4cc3da0c62c5c44bce569ca2d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Apr 2011 16:46:20 +0200 Subject: x86: visws: Fixup irq overhaul fallout Reported-by: Ian Campbell Signed-off-by: Thomas Gleixner --- arch/x86/platform/visws/visws_quirks.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index fe4cf8294878..c7abf13a213f 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -471,15 +471,7 @@ static unsigned int startup_piix4_master_irq(struct irq_data *data) { legacy_pic->init(0); enable_cobalt_irq(data); -} - -static void end_piix4_master_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - enable_cobalt_irq(data); - spin_unlock_irqrestore(&cobalt_lock, flags); + return 0; } static struct irq_chip piix4_master_irq_type = { @@ -492,7 +484,7 @@ static void pii4_mask(struct irq_data *data) { } static struct irq_chip piix4_virtual_irq_type = { .name = "PIIX4-virtual", - .mask = pii4_mask, + .irq_mask = pii4_mask, }; /* @@ -580,9 +572,9 @@ static struct irqaction cascade_action = { static inline void set_piix4_virtual_irq_type(void) { - piix4_virtual_irq_type.enable = i8259A_chip.unmask; - piix4_virtual_irq_type.disable = i8259A_chip.mask; - piix4_virtual_irq_type.unmask = i8259A_chip.unmask; + piix4_virtual_irq_type.irq_enable = i8259A_chip.irq_unmask; + piix4_virtual_irq_type.irq_disable = i8259A_chip.irq_mask; + piix4_virtual_irq_type.irq_unmask = i8259A_chip.irq_unmask; } static void __init visws_pre_intr_init(void) @@ -599,7 +591,7 @@ static void __init visws_pre_intr_init(void) else if (i == CO_IRQ_IDE0) chip = &cobalt_irq_type; else if (i == CO_IRQ_IDE1) - >chip = &cobalt_irq_type; + chip = &cobalt_irq_type; else if (i == CO_IRQ_8259) chip = &piix4_master_irq_type; else if (i < CO_IRQ_APIC0) -- cgit v1.2.2 From 765af22da8a61bd44d354b3c3be955c332325b2f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Apr 2011 03:06:45 -0700 Subject: x86-32, NUMA: Fix ACPI NUMA init broken by recent x86-64 change Commit d8fc3afc49 (x86, NUMA: Move *_numa_init() invocations into initmem_init()) moved acpi_numa_init() call into NUMA initmem_init() but forgot to update 32bit NUMA init breaking ACPI NUMA configuration for 32bit. acpi_numa_init() call was later moved again to srat_64.c. Match it by adding the call to get_memcfg_from_srat() in srat_32.c. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Cyrill Gorcunov Cc: David Rientjes Cc: H. Peter Anvin LKML-Reference: <20110404100645.GE1420@mtj.dyndns.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 48651c6f657d..364f36bdfad8 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -211,10 +211,12 @@ int __init get_memcfg_from_srat(void) { int i, j, nid; - if (srat_disabled()) goto out_fail; + if (acpi_numa_init() < 0) + goto out_fail; + if (num_memory_chunks == 0) { printk(KERN_DEBUG "could not find any ACPI SRAT memory areas.\n"); -- cgit v1.2.2 From d88885d0923ae27b01dfcec644f94829b1e46bea Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 4 Apr 2011 14:48:20 -0400 Subject: xen/debug: Don't be so verbose with WARN on 1-1 mapping errors. There are valid situations in which this error is not a warning. Mainly when QEMU maps a guest memory and uses the VM_IO flag to set the MFNs. For right now make the WARN be WARN_ONCE. In the future we will: 1). Remove the VM_IO code handling.. 2). .. which will also remove this debug facility. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c82df6c9c0f0..a991b57f91fe 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -565,13 +565,13 @@ pte_t xen_make_pte_debug(pteval_t pte) if (io_page && (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; - WARN(addr != other_addr, + WARN_ONCE(addr != other_addr, "0x%lx is using VM_IO, but it is 0x%lx!\n", (unsigned long)addr, (unsigned long)other_addr); } else { pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; other_addr = (_pte.pte & PTE_PFN_MASK); - WARN((addr == other_addr) && (!io_page) && (!iomap_set), + WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set), "0x%lx is missing VM_IO (and wasn't fixed)!\n", (unsigned long)addr); } -- cgit v1.2.2 From 20800bc940af671257abc97ad362abe3c21ddd50 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 30 Mar 2011 15:01:45 +0200 Subject: KVM: fix XSAVE bit scanning When KVM scans the 0xD CPUID leaf for propagating the XSAVE save area leaves, it assumes that the leaves are contigious and stops at the first zero one. On AMD hardware there is a gap, though, as LWP uses leaf 62 to announce it's state save area. So lets iterate through all 64 possible leaves and simply skip zero ones to also cover later features. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 58f517b59645..4a1ba05f7af3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2395,9 +2395,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, int i; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (i = 1; *nent < maxnent; ++i) { - if (entry[i - 1].eax == 0 && i != 2) - break; + for (i = 1; *nent < maxnent && i < 64; ++i) { + if (entry[i].eax == 0) + continue; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; -- cgit v1.2.2 From bd22f5cfcfe8f68bf43b72daf4530cd7eedc9b7a Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 31 Mar 2011 16:58:49 +0200 Subject: KVM: move and fix substitue search for missing CPUID entries If KVM cannot find an exact match for a requested CPUID leaf, the code will try to find the closest match instead of simply confessing it's failure. The implementation was meant to satisfy the CPUID specification, but did not properly check for extended and standard leaves and also didn't account for the index subleaf. Beside that this rule only applies to CPUID intercepts, which is not the only user of the kvm_find_cpuid_entry() function. So fix this algorithm and call it from kvm_emulate_cpuid(). This fixes a crash of newer Linux kernels as KVM guests on AMD Bulldozer CPUs, where bogus values were returned in response to a CPUID intercept. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4a1ba05f7af3..934b4c6b0bf9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4958,12 +4958,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, best = e; break; } - /* - * Both basic or both extended? - */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; } return best; } @@ -4983,6 +4977,27 @@ not_found: return 36; } +/* + * If no match is found, check whether we exceed the vCPU's limit + * and return the content of the highest valid _standard_ leaf instead. + * This is to satisfy the CPUID specification. + */ +static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *maxlevel; + + maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + if (!maxlevel || maxlevel->eax >= function) + return NULL; + if (function & 0x80000000) { + maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); + if (!maxlevel) + return NULL; + } + return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); +} + void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 function, index; @@ -4995,6 +5010,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RCX, 0); kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = kvm_find_cpuid_entry(vcpu, function, index); + + if (!best) + best = check_cpuid_limit(vcpu, function, index); + if (best) { kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); -- cgit v1.2.2 From 61f4237d5b005767a76f4f3694e68e6f78f392d9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 18 Sep 2010 22:25:30 -0700 Subject: xen: just completely disable XSAVE Some (old) versions of Xen just kill the domain if it tries to set any unknown bits in CR4, so we can't reliably probe for OSXSAVE in CR4. Since Xen doesn't support XSAVE for guests at the moment, and no such support is being worked on, there's no downside in just unconditionally masking XSAVE support. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 49dbd78ec3cb..66272a237622 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -250,23 +250,7 @@ static __init void xen_init_cpuid_mask(void) ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ (1 << X86_FEATURE_ACPI)); /* disable ACPI */ - ax = 1; - cx = 0; - xen_cpuid(&ax, &bx, &cx, &dx); - - /* cpuid claims we support xsave; try enabling it to see what happens */ - if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { - unsigned long cr4; - - set_in_cr4(X86_CR4_OSXSAVE); - - cr4 = read_cr4(); - - if ((cr4 & X86_CR4_OSXSAVE) == 0) - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); - - clear_in_cr4(X86_CR4_OSXSAVE); - } + cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); /* disable XSAVE */ } static void xen_set_debugreg(int reg, unsigned long val) -- cgit v1.2.2 From 947ccf9c3c30307b774af3666ee74fcd9f47f646 Mon Sep 17 00:00:00 2001 From: Shan Haitao Date: Tue, 9 Nov 2010 11:43:36 -0800 Subject: xen: Allow PV-OPS kernel to detect whether XSAVE is supported Xen fails to mask XSAVE from the cpuid feature, despite not historically supporting guest use of XSAVE. However, now that XSAVE support has been added to Xen, we need to reliably detect its presence. The most reliable way to do this is to look at the OSXSAVE feature in cpuid which is set iff the OS (Xen, in this case), has set CR4.OSXSAVE. [ Cleaned up conditional a bit. - Jeremy ] Signed-off-by: Shan Haitao Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 66272a237622..e3c6a06cf725 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -238,6 +238,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, static __init void xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; + unsigned int xsave_mask; cpuid_leaf1_edx_mask = ~((1 << X86_FEATURE_MCE) | /* disable MCE */ @@ -249,8 +250,16 @@ static __init void xen_init_cpuid_mask(void) cpuid_leaf1_edx_mask &= ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + ax = 1; + xen_cpuid(&ax, &bx, &cx, &dx); - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); /* disable XSAVE */ + xsave_mask = + (1 << (X86_FEATURE_XSAVE % 32)) | + (1 << (X86_FEATURE_OSXSAVE % 32)); + + /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ + if ((cx & xsave_mask) != xsave_mask) + cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ } static void xen_set_debugreg(int reg, unsigned long val) -- cgit v1.2.2 From 4da9484bdece39ab0b098fa711e095e3e9fc8684 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 6 Apr 2011 13:10:02 -0700 Subject: x86, hibernate: Initialize mmu_cr4_features during boot Restore the initialization of mmu_cr4_features during boot, which was removed without comment in checkin e5f15b45ddf3afa2bbbb10c7ea34fb32b6de0a0e x86: Cleanup highmap after brk is concluded thereby breaking resume from hibernate. This restores previous functionality in approximately the same place, and corrects the reading of %cr4 on pre-CPUID hardware (%cr4 exists if and only if CPUID is supported.) However, part of the problem is that the hibernate suspend/resume sequence should manage the save/restore of %cr4 explicitly. Signed-off-by: H. Peter Anvin Cc: Rafael J. Wysocki Cc: Stefano Stabellini Cc: Yinghai Lu LKML-Reference: <201104020154.57136.rjw@sisk.pl> --- arch/x86/kernel/setup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5a0484a95ad6..4be9b398470e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -976,6 +976,11 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + if (boot_cpu_data.cpuid_level >= 0) { + /* A CPU has %cr4 if and only if it has CPUID */ + mmu_cr4_features = read_cr4(); + } + #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, -- cgit v1.2.2 From f994d99cf140dbb637e49882891c89b3fd84becd Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 6 Apr 2011 18:06:43 +0200 Subject: x86-32, fpu: Fix FPU exception handling on non-SSE systems On 32bit systems without SSE (that is, they use FSAVE/FRSTOR for FPU context switches), FPU exceptions in user mode cause Oopses, BUGs, recursive faults and other nasty things: fpu exception: 0000 [#1] last sysfs file: /sys/power/state Modules linked in: psmouse evdev pcspkr serio_raw [last unloaded: scsi_wait_scan] Pid: 1638, comm: fxsave-32-excep Not tainted 2.6.35-07798-g58a992b-dirty #633 VP3-596B-DD/VT82C597 EIP: 0060:[] EFLAGS: 00010202 CPU: 0 EIP is at math_error+0x1b4/0x1c8 EAX: 00000003 EBX: cf9be7e0 ECX: 00000000 EDX: cf9c5c00 ESI: cf9d9fb4 EDI: c1372db3 EBP: 00000010 ESP: cf9d9f1c DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 Process fxsave-32-excep (pid: 1638, ti=cf9d8000 task=cf9be7e0 task.ti=cf9d8000) Stack: 00000000 00000301 00000004 00000000 00000000 cf9d3000 cf9da8f0 00000001 <0> 00000004 cf9b6b60 c1019a6b c1019a79 00000020 00000242 000001b6 cf9c5380 <0> cf806b40 cf791880 00000000 00000282 00000282 c108a213 00000020 cf9c5380 Call Trace: [] ? need_resched+0x11/0x1a [] ? should_resched+0x5/0x1f [] ? do_sys_open+0xbd/0xc7 [] ? do_sys_open+0xbd/0xc7 [] ? do_coprocessor_error+0x0/0x11 [] ? error_code+0x65/0x70 Code: a8 20 74 30 c7 44 24 0c 06 00 03 00 8d 54 24 04 89 d9 b8 08 00 00 00 e8 9b 6d 02 00 eb 16 8b 93 5c 02 00 00 eb 05 e9 04 ff ff ff <9b> dd 32 9b e9 16 ff ff ff 81 c4 84 00 00 00 5b 5e 5f 5d c3 c6 EIP: [] math_error+0x1b4/0x1c8 SS:ESP 0068:cf9d9f1c This usually continues in slight variations until the system is reset. This bug was introduced by commit 58a992b9cbaf449aeebd3575c3695a9eb5d95b5e: x86-32, fpu: Rewrite fpu_save_init() Signed-off-by: Hans Rosenfeld Link: http://lkml.kernel.org/r/1302106003-366952-1-git-send-email-hans.rosenfeld@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index ef328901c802..c9e09ea05644 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -237,7 +237,7 @@ static inline void fpu_save_init(struct fpu *fpu) } else if (use_fxsr()) { fpu_fxsave(fpu); } else { - asm volatile("fsave %[fx]; fwait" + asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state->fsave)); return; } -- cgit v1.2.2 From 09552b2696896dbb715be0caf91b23276f9139ba Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 7 Apr 2011 09:39:49 +0800 Subject: x86/mrst/vrtc: Fix boot crash in mrst_rtc_init() The sfi_mrtc_array[] only gets initialized when the sfi mrtc table is parsed, so the vrtc_paddr should be initalized after it too. Signed-off-by: Feng Tang Signed-off-by: Alan Cox Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1302140389-27603-1-git-send-email-feng.tang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/vrtc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 04cf645feb92..73d70d65e76e 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -100,9 +100,11 @@ int vrtc_set_mmss(unsigned long nowtime) void __init mrst_rtc_init(void) { - unsigned long vrtc_paddr = sfi_mrtc_array[0].phys_addr; + unsigned long vrtc_paddr; sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); + + vrtc_paddr = sfi_mrtc_array[0].phys_addr; if (!sfi_mrtc_num || !vrtc_paddr) return; -- cgit v1.2.2 From 30d746c68025ee69ac17219aacc9b1614d951f01 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 7 Apr 2011 14:13:15 +0200 Subject: x86/ce4100: Add reg property to bridges without the reg property Ben's new code won't find the PCI & ISA bridge and the devices won't get the DT-node attached. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Thomas Gleixner Cc: davem@davemloft.net Cc: monstr@monstr.eu Cc: Benjamin Herrenschmidt Link: http://lkml.kernel.org/r/20110407121315.GA9204@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/platform/ce4100/falconfalls.dts | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts index dc701ea58546..2d6d226f2b10 100644 --- a/arch/x86/platform/ce4100/falconfalls.dts +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -74,6 +74,7 @@ compatible = "intel,ce4100-pci", "pci"; device_type = "pci"; bus-range = <1 1>; + reg = <0x0800 0x0 0x0 0x0 0x0>; ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>; interrupt-parent = <&ioapic2>; @@ -412,6 +413,7 @@ #address-cells = <2>; #size-cells = <1>; compatible = "isa"; + reg = <0xf800 0x0 0x0 0x0 0x0>; ranges = <1 0 0 0 0 0x100>; rtc@70 { -- cgit v1.2.2 From d419e4c0f7584ffc5c72d9aeeaac485cc756ebcf Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Mon, 11 Apr 2011 22:54:48 +0200 Subject: fix XEN_SAVE_RESTORE Kconfig dependencies Make XEN_SAVE_RESTORE select HIBERNATE_CALLBACKS. Remove XEN_SAVE_RESTORE dependency from PM_SLEEP. Signed-off-by: Shriram Rajagopalan Acked-by: Ian Campbell Signed-off-by: Rafael J. Wysocki --- arch/x86/xen/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 1c7121ba18ff..5cc821cb2e09 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -39,6 +39,7 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool depends on XEN + select HIBERNATE_CALLBACKS default y config XEN_DEBUG_FS -- cgit v1.2.2 From 9d90e49da57fe73a2f35334fdd2fb60dbf3933ed Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 8 Apr 2011 11:23:00 -0700 Subject: x86/mrst: Fix boot crash caused by incorrect pin to irq mapping Moorestown systems crash on boot because the secondary CPU clockevent (apbt1) will fail to request irq#1, which does not have ioapic chip in its irq_desc[] entry. Background: Moorestown platform does not have ISA bus nor legacy IRQs. It reuses the range of legacy IRQs for regular device interrupts. The routing information of early system device IRQs (timers) are obtained from firmware provided SFI tables. We reuse/fake MP configuration table to facilitate IRQ setup with IOAPIC. Maintaining a 1:1 mapping of IOAPIC pin (RTE entry) and IRQ# makes routing information clean and easy to understand on Moorestown. Though optional. This patch allows SFI timer and vRTC IRQ to be treated as ISA IRQ so that pin2irq mapping will be 1:1. Also fixed MP table type and use macros to clearly set MP IRQ entries. As a result, apbt timer and RTC interrupts on Moorestown are within legacy IRQ range: # cat /proc/interrupts CPU0 CPU1 0: 11249 0 IO-APIC-edge apbt0 1: 0 12271 IO-APIC-edge apbt1 8: 887 0 IO-APIC-fasteoi dw_spi 13: 0 0 IO-APIC-fasteoi INTEL_MID_DMAC2 14: 0 0 IO-APIC-fasteoi rtc0 Further discussion of this patch can be found at: https://lkml.org/lkml/2010/6/10/70 Suggested-by: "Eric W. Biederman" Signed-off-by: Jacob Pan Cc: Feng Tang Cc: Alan Cox Cc: Arjan van de Ven Link: http://lkml.kernel.org/r/1302286980-21139-1-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 5c0207bf959b..275dbc19e2cf 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -97,11 +97,11 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table) pentry->freq_hz, pentry->irq); if (!pentry->irq) continue; - mp_irq.type = MP_IOAPIC; + mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ mp_irq.irqflag = 5; - mp_irq.srcbus = 0; + mp_irq.srcbus = MP_BUS_ISA; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; @@ -168,10 +168,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->irq); - mp_irq.type = MP_IOAPIC; + mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; mp_irq.irqflag = 0xf; /* level trigger and active low */ - mp_irq.srcbus = 0; + mp_irq.srcbus = MP_BUS_ISA; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; @@ -282,7 +282,7 @@ void __init x86_mrst_early_setup(void) /* Avoid searching for BIOS MP tables */ x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; - + set_bit(MP_BUS_ISA, mp_bus_not_pci); } /* -- cgit v1.2.2 From 7d6b46707f2491a94f4bd3b4329d2d7f809e9368 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 15 Apr 2011 20:39:01 +0900 Subject: x86, NUMA: Fix fakenuma boot failure Currently, numa=fake boot parameter is broken. If it's used, kernel may panic due to devide by zero error depending on CPU configuration Call Trace: [] find_busiest_group+0x38c/0xd30 [] ? local_clock+0x6f/0x80 [] load_balance+0xa3/0x600 [] idle_balance+0xf3/0x180 [] schedule+0x722/0x7d0 [] ? wait_for_common+0x128/0x190 [] schedule_timeout+0x265/0x320 [] ? lock_release_holdtime+0x35/0x1a0 [] ? wait_for_common+0x128/0x190 [] ? __lock_release+0x9c/0x1d0 [] ? _raw_spin_unlock_irq+0x30/0x40 [] ? _raw_spin_unlock_irq+0x30/0x40 [] wait_for_common+0x130/0x190 [] ? try_to_wake_up+0x510/0x510 [] wait_for_completion+0x1d/0x20 [] kthread_create_on_node+0xac/0x150 [] ? process_scheduled_works+0x40/0x40 [] ? wait_for_common+0x4f/0x190 [] __alloc_workqueue_key+0x1a3/0x590 [] cpuset_init_smp+0x6b/0x7b [] kernel_init+0xc3/0x182 [] kernel_thread_helper+0x4/0x10 [] ? retint_restore_args+0x13/0x13 [] ? start_kernel+0x400/0x400 [] ? gs_change+0x13/0x13 The divede by zero is caused by the following line, group->cpu_power==0: kernel/sched_fair.c::update_sg_lb_stats() /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; This regression was caused by commit e23bba6044 ("x86-64, NUMA: Unify emulated distance mapping") because it changes cpu -> node mapping in the process of dropping fake_physnodes(). old) all cpus are assinged node 0 now) cpus are assigned round robin (the logic is implemented by numa_init_array()) Note: The change in behavior only happens if the system doesn't have neither ACPI SRAT table nor AMD northbridge NUMA information. Round robin assignment doesn't work because init_numa_sched_groups_power() assumes all logical cpus in the same physical cpu share the same node (then it only accounts for group_first_cpu()), and the simple round robin breaks the above assumption. Thus, this patch implements a reassignment of node-ids if buggy firmware or numa emulation makes wrong cpu node map. Tt enforce all logical cpus in the same physical cpu share the same node. Signed-off-by: KOSAKI Motohiro Acked-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/20110415203928.1303.A69D9226@jp.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2871d3c71b6..8ed8908cc9f7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -312,6 +312,26 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } +static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2) +{ + int node1 = early_cpu_to_node(cpu1); + int node2 = early_cpu_to_node(cpu2); + + /* + * Our CPU scheduler assumes all logical cpus in the same physical cpu + * share the same node. But, buggy ACPI or NUMA emulation might assign + * them to different node. Fix it. + */ + if (node1 != node2) { + pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n", + cpu1, node1, cpu2, node2, node2); + + numa_remove_cpu(cpu1); + numa_set_node(cpu1, node2); + numa_add_cpu(cpu1); + } +} + static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); @@ -320,6 +340,7 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2) cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); + check_cpu_siblings_on_same_node(cpu1, cpu2); } @@ -361,10 +382,12 @@ void __cpuinit set_cpu_sibling_map(int cpu) per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); + check_cpu_siblings_on_same_node(cpu, i); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(i)); + check_cpu_siblings_on_same_node(cpu, i); /* * Does this new cpu bringup a new core? */ -- cgit v1.2.2 From 5bbc097d890409d8eff4e3f1d26f11a9d6b7c07e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 15 Apr 2011 14:47:40 +0200 Subject: x86, amd: Disable GartTlbWlkErr when BIOS forgets it This patch disables GartTlbWlk errors on AMD Fam10h CPUs if the BIOS forgets to do is (or is just too old). Letting these errors enabled can cause a sync-flood on the CPU causing a reboot. The AMD BKDG recommends disabling GART TLB Wlk Error completely. This patch is the fix for https://bugzilla.kernel.org/show_bug.cgi?id=33012 on my machine. Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/20110415131152.GJ18463@8bytes.org Tested-by: Alexandre Demers Cc: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/amd.c | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index fd5a1f365c95..3cce71413d0b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -96,11 +96,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_AMD64_MC0_MASK 0xc0010044 + #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3ecece0217ef..3532d3bf8105 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -615,6 +615,25 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* As a rule processors have APIC timer running in deep C states */ if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400)) set_cpu_cap(c, X86_FEATURE_ARAT); + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + */ + if (c->x86 == 0x10) { + /* + * BIOS should disable GartTlbWlk Errors themself. If + * it doesn't do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + u64 mask; + + rdmsrl(MSR_AMD64_MCx_MASK(4), mask); + mask |= (1 << 10); + wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } } #ifdef CONFIG_X86_32 -- cgit v1.2.2 From af289bfe15fc92ecfbf6d8312713815b33e452c0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 15:45:44 +0200 Subject: x86, gart: Convert spaces to tabs in enable_gart_translation Probably by copy&paste this function was indented by spaces. Convert this to tabs. Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1303134346-5805-3-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/gart.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 43085bfc99c3..88c1ebee0db2 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -75,17 +75,17 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) { u32 tmp, ctl; - /* address of the mappings table */ - addr >>= 12; - tmp = (u32) addr<<4; - tmp &= ~0xf; - pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); - - /* Enable GART translation for this hammer. */ - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - ctl |= GARTEN; - ctl &= ~(DISGARTCPU | DISGARTIO); - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + /* address of the mappings table */ + addr >>= 12; + tmp = (u32) addr<<4; + tmp &= ~0xf; + pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); + + /* Enable GART translation for this hammer. */ + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + ctl |= GARTEN; + ctl &= ~(DISGARTCPU | DISGARTIO); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) -- cgit v1.2.2 From c34151a742d84ae65db2088ea30495063f697fbe Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 15:45:45 +0200 Subject: x86, gart: Set DISTLBWALKPRB bit always The DISTLBWALKPRB bit must be set for the GART because the gatt table is mapped UC. But the current code does not set the bit at boot when the BIOS setup the aperture correctly. Fix that by setting this bit when enabling the GART instead of the other places. Cc: Cc: Borislav Petkov Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1303134346-5805-4-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/gart.h | 4 ++-- arch/x86/kernel/aperture_64.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 88c1ebee0db2..156cd5d18d2a 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -66,7 +66,7 @@ static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) * Don't enable translation but enable GART IO and CPU accesses. * Also, set DISTLBWALKPRB since GART tables memory is UC. */ - ctl = DISTLBWALKPRB | order << 1; + ctl = order << 1; pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } @@ -83,7 +83,7 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) /* Enable GART translation for this hammer. */ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - ctl |= GARTEN; + ctl |= GARTEN | DISTLBWALKPRB; ctl &= ~(DISGARTCPU | DISGARTIO); pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 86d1ad4962a7..73fb469908c6 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -499,7 +499,7 @@ out: * Don't enable translation yet but enable GART IO and CPU * accesses and set DISTLBWALKPRB since GART table memory is UC. */ - u32 ctl = DISTLBWALKPRB | aper_order << 1; + u32 ctl = aper_order << 1; bus = amd_nb_bus_dev_ranges[i].bus; dev_base = amd_nb_bus_dev_ranges[i].dev_base; -- cgit v1.2.2 From 665d3e2af83c8fbd149534db8f57d82fa6fa6753 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 15:45:46 +0200 Subject: x86, gart: Make sure GART does not map physmem above 1TB The GART can only map physical memory below 1TB. Make sure the gart driver in the kernel does not try to map memory above 1TB. Cc: Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1303134346-5805-5-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/pci-gart_64.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 82ada01625b9..b117efd24f71 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -81,6 +81,9 @@ static u32 gart_unmapped_entry; #define AGPEXTERN #endif +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + /* backdoor interface to AGP driver */ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; @@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); + unsigned long iommu_page; int i; + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; -- cgit v1.2.2 From 83112e688f5f05dea1e63787db9a6c16b2887a1d Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Sat, 16 Apr 2011 02:27:53 +0200 Subject: perf, x86: Fix pre-defined cache-misses event for AMD family 15h cpus With AMD cpu family 15h a unit mask was introduced for the Data Cache Miss event (0x041/L1-dcache-load-misses). We need to enable bit 0 (first data cache miss or streaming store to a 64 B cache line) of this mask to proper count data cache misses. Now we set this bit for all families and models. In case a PMU does not implement a unit mask for event 0x041 the bit is ignored. Signed-off-by: Andre Przywara Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302913676-14352-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 461f62bbd774..4e1613845b9f 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(L1D) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ -- cgit v1.2.2 From 855357a21744e488cbee23a47d2b124035160a87 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sat, 16 Apr 2011 02:27:54 +0200 Subject: perf, x86: Fix AMD family 15h FPU event constraints Depending on the unit mask settings some FPU events may be scheduled only on cpu counter #3. This patch fixes this. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1302913676-14352-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4e1613845b9f..cf4e369cea67 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -427,7 +427,9 @@ static __initconst const struct x86_pmu amd_pmu = { * * Exceptions: * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) * 0x00B FP PERF_CTL[3] * 0x00D FP PERF_CTL[3] * 0x023 DE PERF_CTL[2:0] @@ -448,6 +450,8 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x0DF LS PERF_CTL[5:0] * 0x1D6 EX PERF_CTL[5:0] * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used */ static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); @@ -460,18 +464,28 @@ static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); static struct event_constraint * amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) { - unsigned int event_code = amd_get_event_code(&event->hw); + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); switch (event_code & AMD_EVENT_TYPE_MASK) { case AMD_EVENT_FP: switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; case 0x003: case 0x00B: case 0x00D: return &amd_f15_PMC3; - default: - return &amd_f15_PMC53; } + return &amd_f15_PMC53; case AMD_EVENT_LS: case AMD_EVENT_DC: case AMD_EVENT_EX_LS: -- cgit v1.2.2 From 19234c0819da0e043a02710488dfd9b242b42eba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Apr 2011 00:36:11 +0200 Subject: PM: Add missing syscore_suspend() and syscore_resume() calls Device suspend/resume infrastructure is used not only by the suspend and hibernate code in kernel/power, but also by APM, Xen and the kexec jump feature. However, commit 40dc166cb5dddbd36aa4ad11c03915ea (PM / Core: Introduce struct syscore_ops for core subsystems PM) failed to add syscore_suspend() and syscore_resume() calls to that code, which generally leads to breakage when the features in question are used. To fix this problem, add the missing syscore_suspend() and syscore_resume() calls to arch/x86/kernel/apm_32.c, kernel/kexec.c and drivers/xen/manage.c. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Ian Campbell --- arch/x86/kernel/apm_32.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0b4be431c620..adee12e0da1f 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -228,6 +228,7 @@ #include #include #include +#include #include #include @@ -1238,6 +1239,7 @@ static int suspend(int vetoable) local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); @@ -1255,6 +1257,7 @@ static int suspend(int vetoable) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; + syscore_resume(); sysdev_resume(); local_irq_enable(); @@ -1280,6 +1283,7 @@ static void standby(void) local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1287,6 +1291,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); + syscore_resume(); sysdev_resume(); local_irq_enable(); -- cgit v1.2.2 From 24bdb0b62cc82120924762ae6bc85afc8c3f2b26 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 12 Apr 2011 12:19:52 +0100 Subject: xen: do not create the extra e820 region at an addr lower than 4G Do not add the extra e820 region at a physical address lower than 4G because it breaks e820_end_of_low_ram_pfn(). It is OK for us to move the xen_extra_mem_start up and down because this is the index of the memory that can be ballooned in/out - it is memory not available to the kernel during bootup. Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index fa0269a99377..90bac0aac3a5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -227,7 +227,7 @@ char * __init xen_memory_setup(void) memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; - xen_extra_mem_start = mem_end; + xen_extra_mem_start = max((1ULL << 32), mem_end); for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; -- cgit v1.2.2 From ee176455e28469e2420032aab3db11ac2ae3eaa8 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 19 Apr 2011 14:47:31 +0100 Subject: xen: mask_rw_pte: do not apply the early_ioremap checks on x86_32 The two "is_early_ioremap_ptep" checks in mask_rw_pte are only used on x86_64, in fact early_ioremap is not used at all to setup the initial pagetable on x86_32. Moreover on x86_32 the two checks are wrong because the range pgt_buf_start..pgt_buf_end initially should be mapped RW because the pages in the range are not pagetable pages yet and haven't been cleared yet. Afterwards considering the pgt_buf_start..pgt_buf_end is part of the initial mapping, xen_alloc_pte is capable of turning the ptes RO when they become pagetable pages. Fix the issue and improve the readability of the code providing two different implementation of mask_rw_pte for x86_32 and x86_64. Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a991b57f91fe..aef7af92b28b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1473,16 +1473,20 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - -#ifdef CONFIG_X86_32 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ if (pte_val_ma(*ptep) & _PAGE_PRESENT) pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & pte_val_ma(pte)); -#endif + + return pte; +} +#else /* CONFIG_X86_64 */ +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); /* * If the new pfn is within the range of the newly allocated @@ -1497,6 +1501,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) return pte; } +#endif /* CONFIG_X86_64 */ /* Init-time set_pte while constructing initial pagetables, which doesn't allow RO pagetable pages to be remapped RW */ -- cgit v1.2.2 From 37f8527dbfd05af0f670aa02370d0c4cca7fbda6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 20 Apr 2011 19:19:10 -0700 Subject: Revert "x86, NUMA: Fix fakenuma boot failure" Andreas Herrmann reported that 7d6b46707f24 ("x86, NUMA: Fix fakenuma boot failure") causes certain physical NUMA topologies (for example AMD Magny-Cours) to move sibling cpus to a single node when in reality they are in separate domains. This may result in some nodes being completely void of cpus, which doesn't accurately represent the correct topology. The system will boot, but will have suboptimal NUMA performance. This commit was intended as a fix for NUMA emulation, but should not cause a regression for real NUMA machines as a side effect. ( There will be a separate fix for the numa-debug code, which will not affect physical topologies. ) Reported-by: Andreas Herrmann Signed-off-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: Tejun Heo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1104201918110.12634@chino.kir.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8ed8908cc9f7..c2871d3c71b6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -312,26 +312,6 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2) -{ - int node1 = early_cpu_to_node(cpu1); - int node2 = early_cpu_to_node(cpu2); - - /* - * Our CPU scheduler assumes all logical cpus in the same physical cpu - * share the same node. But, buggy ACPI or NUMA emulation might assign - * them to different node. Fix it. - */ - if (node1 != node2) { - pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n", - cpu1, node1, cpu2, node2, node2); - - numa_remove_cpu(cpu1); - numa_set_node(cpu1, node2); - numa_add_cpu(cpu1); - } -} - static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); @@ -340,7 +320,6 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2) cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); - check_cpu_siblings_on_same_node(cpu1, cpu2); } @@ -382,12 +361,10 @@ void __cpuinit set_cpu_sibling_map(int cpu) per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); - check_cpu_siblings_on_same_node(cpu, i); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(i)); - check_cpu_siblings_on_same_node(cpu, i); /* * Does this new cpu bringup a new core? */ -- cgit v1.2.2 From 7a6c6547825a2324faa76cff856db11d78de075e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 20 Apr 2011 19:19:13 -0700 Subject: x86, numa: Fix cpu nodemasks for NUMA emulation and CONFIG_DEBUG_PER_CPU_MAPS The cpu<->node mappings under CONFIG_DEBUG_PER_CPU_MAPS=y when NUMA emulation is enabled is currently broken because it does not iterate through every emulated node and bind cpus that have affinity to it. NUMA emulation should bind each cpu to every local node to accurately represent the true NUMA topology of the underlying machine. debug_cpumask_set_cpu() needs to be fixed at the same time so that the debugging information that it emits shows the new cpumask of the node being assigned when the cpu is being added or removed. It can now take responsibility of setting or clearing the cpu itself to remove the need for duplicate code. Also change its last parameter, "enable", to have the correct bool type since it can only be true or false. -v2: Fix the return statements, by Kosaki Motohiro Acked-and-Tested-by: KOSAKI Motohiro Signed-off-by: David Rientjes Cc: Andreas Herrmann Cc: Tejun Heo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1104201918470.12634@chino.kir.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/numa.h | 2 +- arch/x86/mm/numa.c | 31 +++++++++++++------------------ arch/x86/mm/numa_emulation.c | 20 ++++++-------------- 3 files changed, 20 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 3d4dab43c994..a50fc9f493b3 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -51,7 +51,7 @@ static inline void numa_remove_cpu(int cpu) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS -struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable); +void debug_cpumask_set_cpu(int cpu, int node, bool enable); #endif #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 9559d360fde7..745258dfc4dc 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -213,53 +213,48 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } -struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) +void debug_cpumask_set_cpu(int cpu, int node, bool enable) { - int node = early_cpu_to_node(cpu); struct cpumask *mask; char buf[64]; if (node == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ - return NULL; + return; } mask = node_to_cpumask_map[node]; if (!mask) { pr_err("node_to_cpumask_map[%i] NULL\n", node); dump_stack(); - return NULL; + return; } + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + cpulist_scnprintf(buf, sizeof(buf), mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); - return mask; + return; } # ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, int enable) +static void __cpuinit numa_set_cpumask(int cpu, bool enable) { - struct cpumask *mask; - - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); + debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } void __cpuinit numa_add_cpu(int cpu) { - numa_set_cpumask(cpu, 1); + numa_set_cpumask(cpu, true); } void __cpuinit numa_remove_cpu(int cpu) { - numa_set_cpumask(cpu, 0); + numa_set_cpumask(cpu, false); } # endif /* !CONFIG_NUMA_EMU */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index ad091e4cff17..de84cc140379 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -454,10 +454,9 @@ void __cpuinit numa_remove_cpu(int cpu) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) +static void __cpuinit numa_set_cpumask(int cpu, bool enable) { - struct cpumask *mask; - int nid, physnid, i; + int nid, physnid; nid = early_cpu_to_node(cpu); if (nid == NUMA_NO_NODE) { @@ -467,28 +466,21 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) physnid = emu_nid_to_phys[nid]; - for_each_online_node(i) { + for_each_online_node(nid) { if (emu_nid_to_phys[nid] != physnid) continue; - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); + debug_cpumask_set_cpu(cpu, nid, enable); } } void __cpuinit numa_add_cpu(int cpu) { - numa_set_cpumask(cpu, 1); + numa_set_cpumask(cpu, true); } void __cpuinit numa_remove_cpu(int cpu) { - numa_set_cpumask(cpu, 0); + numa_set_cpumask(cpu, false); } #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -- cgit v1.2.2 From b2508e828d71baacd9a743dd48dcbf85d96affdd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 21 Apr 2011 16:48:35 -0700 Subject: perf: Support Xeon E7's via the Westmere PMU driver There's a new model number public, 47, for Xeon E7 (aka Westmere EX). Signed-off-by: Andi Kleen Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/1303429715-10202-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8fc2b2cee1da..586cced12d1f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1425,6 +1425,7 @@ static __init int intel_pmu_init(void) case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ + case 47: /* 32 nm Xeon E7 */ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, -- cgit v1.2.2 From b52c55c6a25e4515b5e075a989ff346fc251ed09 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Apr 2011 08:44:38 +0200 Subject: x86, perf event: Turn off unstructured raw event access to offcore registers Andi Kleen pointed out that the Intel offcore support patches were merged without user-space tool support to the functionality: | | The offcore_msr perf kernel code was merged into 2.6.39-rc*, but the | user space bits were not. This made it impossible to set the extra mask | and actually do the OFFCORE profiling | Andi submitted a preliminary patch for user-space support, as an extension to perf's raw event syntax: | | Some raw events -- like the Intel OFFCORE events -- support additional | parameters. These can be appended after a ':'. | | For example on a multi socket Intel Nehalem: | | perf stat -e r1b7:20ff -a sleep 1 | | Profile the OFFCORE_RESPONSE.ANY_REQUEST with event mask REMOTE_DRAM_0 | that measures any access to DRAM on another socket. | But this kind of usability is absolutely unacceptable - users should not be expected to type in magic, CPU and model specific incantations to get access to useful hardware functionality. The proper solution is to expose useful offcore functionality via generalized events - that way users do not have to care which specific CPU model they are using, they can use the conceptual event and not some model specific quirky hexa number. We already have such generalization in place for CPU cache events, and it's all very extensible. "Offcore" events measure general DRAM access patters along various parameters. They are particularly useful in NUMA systems. We want to support them via generalized DRAM events: either as the fourth level of cache (after the last-level cache), or as a separate generalization category. That way user-space support would be very obvious, memory access profiling could be done via self-explanatory commands like: perf record -e dram ./myapp perf record -e dram-remote ./myapp ... to measure DRAM accesses or more expensive cross-node NUMA DRAM accesses. These generalized events would work on all CPUs and architectures that have comparable PMU features. ( Note, these are just examples: actual implementation could have more sophistication and more parameter - as long as they center around similarly simple usecases. ) Now we do not want to revert *all* of the current offcore bits, as they are still somewhat useful for generic last-level-cache events, implemented in this commit: e994d7d23a0b: perf: Fix LLC-* events on Intel Nehalem/Westmere But we definitely do not yet want to expose the unstructured raw events to user-space, until better generalization and usability is implemented for these hardware event features. ( Note: after generalization has been implemented raw offcore events can be supported as well: there can always be an odd event that is marginally useful but not useful enough to generalize. DRAM profiling is definitely *not* such a category so generalization must be done first. ) Furthermore, PERF_TYPE_RAW access to these registers was not intended to go upstream without proper support - it was a side-effect of the above e994d7d23a0b commit, not mentioned in the changelog. As v2.6.39 is nearing release we go for the simplest approach: disable the PERF_TYPE_RAW offcore hack for now, before it escapes into a released kernel and becomes an ABI. Once proper structure is implemented for these hardware events and users are offered usable solutions we can revisit this issue. Reported-by: Andi Kleen Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1302658203-4239-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index eed3673a8656..632e5dc9c9c0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -586,8 +586,12 @@ static int x86_setup_perfctr(struct perf_event *event) return -EOPNOTSUPP; } + /* + * Do not allow config1 (extended registers) to propagate, + * there's no sane user-space generalization yet: + */ if (attr->type == PERF_TYPE_RAW) - return x86_pmu_extra_regs(event->attr.config, event); + return 0; if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); -- cgit v1.2.2 From 1ea5a6afd95a4803900c97ed63a47a883ebe7b3e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 21 Apr 2011 11:03:21 -0400 Subject: perf, x86: P4 PMU - Don't forget to clear cpuc->active_mask on overflow It's not enough to simply disable event on overflow the cpuc->active_mask should be cleared as well otherwise counter may stall in "active" even in real being already disabled (which potentially may lead to the situation that user may not use this counter further). Don pointed out that: " I also noticed this patch fixed some unknown NMIs on a P4 when I stressed the box". Tested-by: Lin Ming Signed-off-by: Cyrill Gorcunov Acked-by: Don Zickus Signed-off-by: Don Zickus Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1303398203-2918-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index c2520e178d32..d1f77e2934a1 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -947,7 +947,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - p4_pmu_disable_event(event); + x86_pmu_stop(event, 0); } if (handled) { -- cgit v1.2.2 From f4929bd37208540c2c6f416e9035ff1938f2dbc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Apr 2011 13:39:56 +0200 Subject: perf, x86: Update/fix Intel Nehalem cache events Change the Nehalem cache events to use retired memory instruction counters (similar to Westmere), this greatly improves the provided stats. Using: main () { int i; for (i = 0; i < 1000000000; i++) { asm("mov (%%rsp), %%rbx;" "mov %%rbx, (%%rsp);" : : : "rbx"); } } We find: $ perf stat --repeat 10 -e instructions:u -e l1-dcache-loads:u -e l1-dcache-stores:u ./loop_1b_loads+stores Performance counter stats for './loop_1b_loads+stores' (10 runs): 4,000,081,056 instructions:u # 0.000 IPC ( +- 0.000% ) 4,999,502,846 l1-dcache-loads:u ( +- 0.008% ) 1,000,034,832 l1-dcache-stores:u ( +- 0.000% ) 1.565184942 seconds time elapsed ( +- 0.005% ) The 5b is surprising - we'd expect 1b: $ perf stat --repeat 10 -e instructions:u -e r10b:u -e l1-dcache-stores:u ./loop_1b_loads+stores Performance counter stats for './loop_1b_loads+stores' (10 runs): 4,000,081,054 instructions:u # 0.000 IPC ( +- 0.000% ) 1,000,021,961 r10b:u ( +- 0.000% ) 1,000,030,951 l1-dcache-stores:u ( +- 0.000% ) 1.565055422 seconds time elapsed ( +- 0.003% ) Which this patch thus fixes. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt Cc: Stephane Eranian Cc: Lin Ming Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/n/tip-q9rtru7b7840tws75xzboapv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 586cced12d1f..43fa20b13817 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -391,12 +391,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ -- cgit v1.2.2 From 39b68976ac653cfdc7f872a293e8b7928de2dcc6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Apr 2011 14:52:37 -0700 Subject: x86, setup: When probing memory with e801, use ax/bx as a pair When we use BIOS function e801 to probe memory, we should use ax/bx (or cx/dx) as a pair, not mix and match. This was a typo during the translation from assembly code, and breaks at least one set of machines in the field (which return cx = dx = 0). Reported-and-tested-by: Chris Samuel Fix-proposed-by: Thomas Meyer Link: http://lkml.kernel.org/r/1303566747.12067.10.camel@localhost.localdomain --- arch/x86/boot/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index cae3feb1035e..db75d07c3645 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -91,7 +91,7 @@ static int detect_memory_e801(void) if (oreg.ax > 15*1024) { return -1; /* Bogus! */ } else if (oreg.ax == 15*1024) { - boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax; + boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax; } else { /* * This ignores memory above 16MB if we have a memory -- cgit v1.2.2 From 18a073a3acd3a47fbb5e23333df7fad28d576345 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Apr 2011 13:24:33 +0200 Subject: perf, x86: Fix BTS condition Currently the x86 backend incorrectly assumes that any BRANCH_INSN with sample_period==1 is a BTS request. This is not true when we do frequency driven profiling such as 'perf record -e branches'. Solves this error: $ perf record -e branches ./array Error: sys_perf_event_open() syscall returned with 95 (Operation not supported). Signed-off-by: Peter Zijlstra Reported-by: Ingo Molnar Cc: "Metzger, Markus T" Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-rd2y4ct71hjawzz6fpvsy9hg@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- arch/x86/kernel/cpu/perf_event_intel.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 632e5dc9c9c0..fac0654021b8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -613,8 +613,8 @@ static int x86_setup_perfctr(struct perf_event *event) /* * Branch tracing: */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { /* BTS is not supported by this architecture. */ if (!x86_pmu.bts_active) return -EOPNOTSUPP; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 43fa20b13817..9194b0698d63 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -998,6 +998,9 @@ intel_bts_constraints(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; + if (event->attr.freq) + return NULL; + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); -- cgit v1.2.2 From ec75a71634dabe439db91c1ef51d5099f4493808 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 11:51:41 +0200 Subject: perf events, x86: Work around the Nehalem AAJ80 erratum On Nehalem CPUs the retired branch-misses event can be completely bogus, when there are no branch-misses occuring. When there are a lot of branch misses then the count is pretty accurate. Still, this leaves us with an event that over-counts a lot. Detect this erratum and work it around by using BR_MISP_EXEC.ANY events. These will also count speculated branches but still it's a lot more precise in practice than the architectural event. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-yyfg0bxo9jsqxd6a0ovfny27@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9194b0698d63..9ae4a2aa7398 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -25,7 +25,7 @@ struct intel_percore { /* * Intel PerfMon, used on Core and later. */ -static const u64 intel_perfmon_event_map[] = +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, @@ -1308,7 +1308,7 @@ static void intel_clovertown_quirks(void) * AJ106 could possibly be worked around by not allowing LBR * usage from PEBS, including the fixup. * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. + * a pebs_event_reset[0] value and coping with the lost events. * * But taken together it might just make sense to not enable PEBS on * these chips. @@ -1412,6 +1412,18 @@ static __init int intel_pmu_init(void) x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + + if (ebx & 0x40) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + + pr_cont("erratum AAJ80 worked around, "); + } pr_cont("Nehalem events, "); break; -- cgit v1.2.2 From 2bce5daca28346f19c190dbdb5542c9fe3e8c6e6 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 27 Apr 2011 06:32:33 -0400 Subject: perf, x86, nmi: Move LVT un-masking into irq handlers It was noticed that P4 machines were generating double NMIs for each perf event. These extra NMIs lead to 'Dazed and confused' messages on the screen. I tracked this down to a P4 quirk that said the overflow bit had to be cleared before re-enabling the apic LVT mask. My first attempt was to move the un-masking inside the perf nmi handler from before the chipset NMI handler to after. This broke Nehalem boxes that seem to like the unmasking before the counters themselves are re-enabled. In order to keep this change simple for 2.6.39, I decided to just simply move the apic LVT un-masking to the beginning of all the chipset NMI handlers, with the exception of Pentium4's to fix the double NMI issue. Later on we can move the un-masking to later in the handlers to save a number of 'extra' NMIs on those particular chipsets. I tested this change on a P4 machine, an AMD machine, a Nehalem box, and a core2quad box. 'perf top' worked correctly along with various other small 'perf record' runs. Anything high stress breaks all the machines but that is a different problem. Thanks to various people for testing different versions of this patch. Reported-and-tested-by: Shaun Ruffell Signed-off-by: Don Zickus Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1303900353-10242-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar CC: Cyrill Gorcunov --- arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++-- arch/x86/kernel/cpu/perf_event_intel.c | 10 ++++++++++ arch/x86/kernel/cpu/perf_event_p4.c | 17 +++++++++++++---- 3 files changed, 33 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fac0654021b8..e638689279d3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1288,6 +1288,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) { /* @@ -1374,8 +1384,6 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_DONE; } - apic_write(APIC_LVTPC, APIC_DM_NMI); - handled = x86_pmu.handle_irq(args->regs); if (!handled) return NOTIFY_DONE; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9ae4a2aa7398..e61539b07d2c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -933,6 +933,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This handler doesn't seem to have any issues with the unmasking + * so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index d1f77e2934a1..e93fcd55fae1 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -950,11 +950,20 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) x86_pmu_stop(event, 0); } - if (handled) { - /* p4 quirk: unmask it again */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + if (handled) inc_irq_stat(apic_perf_irqs); - } + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } -- cgit v1.2.2 From 20443598d9bdfe3563f901e27fd482a3f5d3d231 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 Apr 2011 16:30:52 +0200 Subject: x86: devicetree: Configure IOAPIC pin only once We use io_apic_setup_irq_pin() in order to configure pin's interrupt number polarity and type. This is done on every irq_create_of_mapping() which happens for instance during pci enable calls. Level typed interrupts are masked by default, edge are unmasked. On the first ->xlate() call the level interrupt is configured and masked. The driver calls request_irq() and the line is unmasked. Lets assume the interrupt line is shared with another device and we call pci_enable_device() for this device. The ->xlate() configures the pin again and it is masked. request_irq() does not unmask the line because it _is_ already unmasked according to its internal state. So the interrupt will never be unmasked again. This patch is based on an earlier work by Torben Hohn and solves the problem by configuring the pin only once. Since all devices must agree on the same type and polarity there is no point in configuring the pin more than once. [ tglx: Split out the ce4100 part into a separate patch ] Cc: Torben Hohn Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/%3C20110427143052.GA15211%40linutronix.de%3E Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 2 +- arch/x86/kernel/apic/io_apic.c | 10 +++++----- arch/x86/kernel/devicetree.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index c4bd267dfc50..a97a240f67f3 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -150,7 +150,7 @@ void setup_IO_APIC_irq_extra(u32 gsi); extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); -int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr); +int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 68df09bba92e..45fd33d1fd3a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -128,8 +128,8 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -static int io_apic_setup_irq_pin_once(unsigned int irq, int node, - struct io_apic_irq_attr *attr); +static int io_apic_setup_irq_pin(unsigned int irq, int node, + struct io_apic_irq_attr *attr); /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) @@ -3570,7 +3570,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -int +static int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) { struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); @@ -3585,8 +3585,8 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return ret; } -static int io_apic_setup_irq_pin_once(unsigned int irq, int node, - struct io_apic_irq_attr *attr) +int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) { unsigned int id = attr->ioapic, pin = attr->ioapic_pin; int ret; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 706a9fb46a58..e90f08458e6b 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -391,7 +391,7 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr); + return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); } static void __init ioapic_add_ofnode(struct device_node *np) -- cgit v1.2.2 From 1ff42c32c7614c2e810ed388fd1ba04a5626b74c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 Apr 2011 16:30:52 +0200 Subject: x86: ce4100: Configure IOAPIC pins for USB and SATA to level type The USB and SATA ioapic interrrupt pins are configured as edge type, but need to be level type interrupts to work correctly. [ tglx: Split out from the combo patch ] Cc: Torben Hohn Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/%3C20110427143052.GA15211%40linutronix.de%3E Signed-off-by: Thomas Gleixner --- arch/x86/platform/ce4100/falconfalls.dts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts index 2d6d226f2b10..e70be38ce039 100644 --- a/arch/x86/platform/ce4100/falconfalls.dts +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -347,7 +347,7 @@ "pciclass0c03"; reg = <0x16800 0x0 0x0 0x0 0x0>; - interrupts = <22 3>; + interrupts = <22 1>; }; usb@d,1 { @@ -357,7 +357,7 @@ "pciclass0c03"; reg = <0x16900 0x0 0x0 0x0 0x0>; - interrupts = <22 3>; + interrupts = <22 1>; }; sata@e,0 { @@ -367,7 +367,7 @@ "pciclass0106"; reg = <0x17000 0x0 0x0 0x0 0x0>; - interrupts = <23 3>; + interrupts = <23 1>; }; flash@f,0 { -- cgit v1.2.2 From e20a2d205c05cef6b5783df339a7d54adeb50962 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 29 Apr 2011 17:47:43 -0400 Subject: x86, AMD: Fix APIC timer erratum 400 affecting K8 Rev.A-E processors Older AMD K8 processors (Revisions A-E) are affected by erratum 400 (APIC timer interrupts don't occur in C states greater than C1). This, for example, means that X86_FEATURE_ARAT flag should not be set for these parts. This addresses regression introduced by commit b87cf80af3ba4b4c008b4face3c68d604e1715c6 ("x86, AMD: Set ARAT feature on AMD processors") where the system may become unresponsive until external interrupt (such as keyboard input) occurs. This results, for example, in time not being reported correctly, lack of progress on the system and other lockups. Reported-by: Joerg-Volker Peetz Tested-by: Joerg-Volker Peetz Acked-by: Borislav Petkov Signed-off-by: Boris Ostrovsky Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1304113663-6586-1-git-send-email-ostr@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3532d3bf8105..bb9eb29a52dd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -698,7 +698,7 @@ cpu_dev_register(amd_cpu_dev); */ const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0x0f, 0x4, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); EXPORT_SYMBOL_GPL(amd_erratum_400); -- cgit v1.2.2 From 2be19102b71c1a45d37fec50303791daa1a06869 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 May 2011 19:12:04 +0200 Subject: x86, NUMA: Fix empty memblk detection in numa_cleanup_meminfo() numa_cleanup_meminfo() trims each memblk between low (0) and high (max_pfn) limits and discards empty ones. However, the emptiness detection incorrectly used equality test. If the start of a memblk is higher than max_pfn, it is empty but fails the equality test and doesn't get discarded. The condition triggers when max_pfn is lower than start of a NUMA node and results in memory misconfiguration - leading to WARN_ON()s and other funnies. The bug was discovered in devel branch where 32bit too uses this code path for NUMA init. If a node is above the addressing limit, max_pfn ends up lower than the node triggering this problem. The failure hasn't been observed on x86-64 but is still possible with broken hardware e820/NUMA info. As the fix is very low risk, it would be better to apply it even for 64bit. Fix it by using >= instead of ==. Signed-off-by: Yinghai Lu [ Extracted the actual fix from the original patch and rewrote patch description. ] Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/20110501171204.GO29280@htj.dyndns.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e8c00cc72033..85b52fc03084 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -306,7 +306,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->end = min(bi->end, high); /* and there's no empty block */ - if (bi->start == bi->end) { + if (bi->start >= bi->end) { numa_remove_memblk_from(i--, mi); continue; } -- cgit v1.2.2 From a38647837a411f7df79623128421eef2118b5884 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 29 Apr 2011 11:34:00 -0400 Subject: xen/mmu: Add workaround "x86-64, mm: Put early page table high" As a consequence of the commit: commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e Author: Yinghai Lu Date: Fri Dec 17 16:58:28 2010 -0800 x86-64, mm: Put early page table high it causes the Linux kernel to crash under Xen: mapping kernel into physical memory Xen: setup ISA identity maps about to get started... (XEN) mm.c:2466:d0 Bad type (saw 7400000000000001 != exp 1000000000000000) for mfn b1d89 (pfn bacf7) (XEN) mm.c:3027:d0 Error while pinning mfn b1d89 (XEN) traps.c:481:d0 Unhandled invalid opcode fault/trap [#6] on VCPU 0 [ec=0000] (XEN) domain_crash_sync called from entry.S (XEN) Domain 0 (vcpu#0) crashed on cpu#0: ... The reason is that at some point init_memory_mapping is going to reach the pagetable pages area and map those pages too (mapping them as normal memory that falls in the range of addresses passed to init_memory_mapping as argument). Some of those pages are already pagetable pages (they are in the range pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and everything is fine. Some of these pages are not pagetable pages yet (they fall in the range pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they are going to be mapped RW. When these pages become pagetable pages and are hooked into the pagetable, xen will find that the guest has already a RW mapping of them somewhere and fail the operation. The reason Xen requires pagetables to be RO is that the hypervisor needs to verify that the pagetables are valid before using them. The validation operations are called "pinning" (more details in arch/x86/xen/mmu.c). In order to fix the issue we mark all the pages in the entire range pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation is completed only the range pgt_buf_start-pgt_buf_end is reserved by init_memory_mapping. Hence the kernel is going to crash as soon as one of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those ranges are RO). For this reason, this function is introduced which is called _after_ the init_memory_mapping has completed (in a perfect world we would call this function from init_memory_mapping, but lets ignore that). Because we are called _after_ init_memory_mapping the pgt_buf_[start, end,top] have all changed to new values (b/c another init_memory_mapping is called). Hence, the first time we enter this function, we save away the pgt_buf_start value and update the pgt_buf_[end,top]. When we detect that the "old" pgt_buf_start through pgt_buf_end PFNs have been reserved (so memblock_x86_reserve_range has been called), we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. And then we update those "old" pgt_buf_[end|top] with the new ones so that we can redo this on the next pagetable. Acked-by: "H. Peter Anvin" Reviewed-by: Jeremy Fitzhardinge [v1: Updated with Jeremy's comments] [v2: Added the crash output] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aef7af92b28b..1bca25f60ff2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm) return ret; } +#ifdef CONFIG_X86_64 +static __initdata u64 __last_pgt_set_rw = 0; +static __initdata u64 __pgt_buf_start = 0; +static __initdata u64 __pgt_buf_end = 0; +static __initdata u64 __pgt_buf_top = 0; +/* + * As a consequence of the commit: + * + * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e + * Author: Yinghai Lu + * Date: Fri Dec 17 16:58:28 2010 -0800 + * + * x86-64, mm: Put early page table high + * + * at some point init_memory_mapping is going to reach the pagetable pages + * area and map those pages too (mapping them as normal memory that falls + * in the range of addresses passed to init_memory_mapping as argument). + * Some of those pages are already pagetable pages (they are in the range + * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and + * everything is fine. + * Some of these pages are not pagetable pages yet (they fall in the range + * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they + * are going to be mapped RW. When these pages become pagetable pages and + * are hooked into the pagetable, xen will find that the guest has already + * a RW mapping of them somewhere and fail the operation. + * The reason Xen requires pagetables to be RO is that the hypervisor needs + * to verify that the pagetables are valid before using them. The validation + * operations are called "pinning". + * + * In order to fix the issue we mark all the pages in the entire range + * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation + * is completed only the range pgt_buf_start-pgt_buf_end is reserved by + * init_memory_mapping. Hence the kernel is going to crash as soon as one + * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those + * ranges are RO). + * + * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ + * the init_memory_mapping has completed (in a perfect world we would + * call this function from init_memory_mapping, but lets ignore that). + * + * Because we are called _after_ init_memory_mapping the pgt_buf_[start, + * end,top] have all changed to new values (b/c init_memory_mapping + * is called and setting up another new page-table). Hence, the first time + * we enter this function, we save away the pgt_buf_start value and update + * the pgt_buf_[end,top]. + * + * When we detect that the "old" pgt_buf_start through pgt_buf_end + * PFNs have been reserved (so memblock_x86_reserve_range has been called), + * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. + * + * And then we update those "old" pgt_buf_[end|top] with the new ones + * so that we can redo this on the next pagetable. + */ +static __init void mark_rw_past_pgt(void) { + + if (pgt_buf_end > pgt_buf_start) { + u64 addr, size; + + /* Save it away. */ + if (!__pgt_buf_start) { + __pgt_buf_start = pgt_buf_start; + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + return; + } + /* If we get the range that starts at __pgt_buf_end that means + * the range is reserved, and that in 'init_memory_mapping' + * the 'memblock_x86_reserve_range' has been called with the + * outdated __pgt_buf_start, __pgt_buf_end (the "new" + * pgt_buf_[start|end|top] refer now to a new pagetable. + * Note: we are called _after_ the pgt_buf_[..] have been + * updated.*/ + + addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), + &size, PAGE_SIZE); + + /* Still not reserved, meaning 'memblock_x86_reserve_range' + * hasn't been called yet. Update the _end and _top.*/ + if (addr == PFN_PHYS(__pgt_buf_start)) { + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + return; + } + + /* OK, the area is reserved, meaning it is time for us to + * set RW for the old end->top PFNs. */ + + /* ..unless we had already done this. */ + if (__pgt_buf_end == __last_pgt_set_rw) + return; + + addr = PFN_PHYS(__pgt_buf_end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", + PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); + + while (addr < PFN_PHYS(__pgt_buf_top)) { + make_lowmem_page_readwrite(__va(addr)); + addr += PAGE_SIZE; + } + /* And update everything so that we are ready for the next + * pagetable (the one created for regions past 4GB) */ + __last_pgt_set_rw = __pgt_buf_end; + __pgt_buf_start = pgt_buf_start; + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + } + return; +} +#else +static __init void mark_rw_past_pgt(void) { } +#endif static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) { #ifdef CONFIG_X86_64 @@ -1488,6 +1601,14 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { unsigned long pfn = pte_pfn(pte); + /* + * A bit of optimization. We do not need to call the workaround + * when xen_set_pte_init is called with a PTE with 0 as PFN. + * That is b/c the pagetable at that point are just being populated + * with empty values and we can save some cycles by not calling + * the 'memblock' code.*/ + if (pfn) + mark_rw_past_pgt(); /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an @@ -1997,6 +2118,8 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { + mark_rw_past_pgt(); + #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); #endif -- cgit v1.2.2 From b9269dc7bfdf8c985971c09f2dcb2aa04ad7986d Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 12 Apr 2011 12:19:49 +0100 Subject: xen: mask_rw_pte mark RO all pagetable pages up to pgt_buf_top mask_rw_pte is currently checking if a pfn is a pagetable page if it falls in the range pgt_buf_start - pgt_buf_end but that is incorrect because pgt_buf_end is a moving target: pgt_buf_top is the real boundary. Acked-by: "H. Peter Anvin" Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 1bca25f60ff2..55c965b38c27 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1616,7 +1616,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) * it is RO. */ if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_end)) || + pfn >= pgt_buf_start && pfn < pgt_buf_top)) || (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) pte = pte_wrprotect(pte); -- cgit v1.2.2 From 7806a49ab625ebeb1709e5e87299b64932b807a7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 May 2011 14:33:24 -0700 Subject: x86, reboot: Fix relocations in reboot_32.S The use of base for %ebx in this file is arbitrary, *except* that we also use it to compute the real-mode segment. Therefore, make it so that r_base really is the true address to which %ebx points. This resolves kernel bugzilla 33302. Reported-and-tested-by: Alexey Zaytsev Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-08os5wi3yq1no0y4i5m4z7he@git.kernel.org --- arch/x86/kernel/reboot_32.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S index 29092b38d816..1d5c46df0d78 100644 --- a/arch/x86/kernel/reboot_32.S +++ b/arch/x86/kernel/reboot_32.S @@ -21,26 +21,26 @@ r_base = . /* Get our own relocated address */ call 1f 1: popl %ebx - subl $1b, %ebx + subl $(1b - r_base), %ebx /* Compute the equivalent real-mode segment */ movl %ebx, %ecx shrl $4, %ecx /* Patch post-real-mode segment jump */ - movw dispatch_table(%ebx,%eax,2),%ax - movw %ax, 101f(%ebx) - movw %cx, 102f(%ebx) + movw (dispatch_table - r_base)(%ebx,%eax,2),%ax + movw %ax, (101f - r_base)(%ebx) + movw %cx, (102f - r_base)(%ebx) /* Set up the IDT for real mode. */ - lidtl machine_real_restart_idt(%ebx) + lidtl (machine_real_restart_idt - r_base)(%ebx) /* * Set up a GDT from which we can load segment descriptors for real * mode. The GDT is not used in real mode; it is just needed here to * prepare the descriptors. */ - lgdtl machine_real_restart_gdt(%ebx) + lgdtl (machine_real_restart_gdt - r_base)(%ebx) /* * Load the data segment registers with 16-bit compatible values -- cgit v1.2.2