From cbc34973709eb41b369c304c075cf2069f847012 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 13 Feb 2008 13:14:35 -0800 Subject: lguest: include function prototypes Added a declaration to asm-x86/lguest.h and moved the extern arrays there as well. As an alternative to including asm/lguest.h directly, an include could be put in linux/lguest.h Signed-off-by: Harvey Harrison Cc: "rusty@rustcorp.com.au" Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/lguest/boot.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5afdde4895dc..1e613fb03e32 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -75,15 +76,6 @@ * behaving in simplified but equivalent ways. In particular, the Guest is the * same kernel as the Host (or at least, built from the same source code). :*/ -/* Declarations for definitions in lguest_guest.S */ -extern char lguest_noirq_start[], lguest_noirq_end[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_sti[], lgend_sti[]; -extern const char lgstart_popf[], lgend_popf[]; -extern const char lgstart_pushf[], lgend_pushf[]; -extern const char lgstart_iret[], lgend_iret[]; -extern void lguest_iret(void); - struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, .noirq_start = (u32)lguest_noirq_start, -- cgit v1.2.2 From db342d216ba9e060d8c5501eefc1d0a789c9e711 Mon Sep 17 00:00:00 2001 From: Tony Breeds Date: Tue, 19 Feb 2008 08:16:03 +0100 Subject: lguest: fix build breakage [ mingo@elte.hu: merged to Rusty's patch ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/asm-offsets_32.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index a33d53017997..8ea040124f7d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -128,13 +128,11 @@ void foo(void) OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); #endif -#ifdef CONFIG_LGUEST_GUEST +#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); -#endif -#ifdef CONFIG_LGUEST BLANK(); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); -- cgit v1.2.2 From 1ce70c4fac3c3954bd48c035f448793867592bc0 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sun, 24 Feb 2008 17:55:15 +0200 Subject: x86/lguest: fix pgdir pmd index calculation Hi all, Beginning from commits close to v2.6.25-rc2, running lguest always oopses the host kernel. Oops is at [1]. Bisection led to the following commit: commit 37cc8d7f963ba2deec29c9b68716944516a3244f x86/early_ioremap: don't assume we're using swapper_pg_dir At the early stages of boot, before the kernel pagetable has been fully initialized, a Xen kernel will still be running off the Xen-provided pagetables rather than swapper_pg_dir[]. Therefore, readback cr3 to determine the base of the pagetable rather than assuming swapper_pg_dir[]. static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { - pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)]; + /* Don't assume we're using swapper_pg_dir at this point */ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(addr)]; pud_t *pud = pud_offset(pgd, addr); pmd_t *pmd = pmd_offset(pud, addr); Trying to analyze the problem, it seems on the guest side of lguest, %cr3 has a different value from &swapper_pg-dir (which is AFAIK fine on a pravirt guest): Putting some debugging messages in early_ioremap_pmd: /* Appears 3 times */ [ 0.000000] *************************** [ 0.000000] __va(%cr3) = c0000000, &swapper_pg_dir = c02cc000 [ 0.000000] *************************** After 8 hours of debugging and staring on lguest code, I noticed something strange in paravirt_ops->set_pmd hypercall invocation: static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); } The first hcall parameter is global pgdir which looks fine. The second parameter is the pmd index in the pgdir which is suspectful. AFAIK, calculating the index of pmd does not need a divisoin over four. Removing the division made lguest work fine again . Patch is at [2]. I am not sure why the division over four existed in the first place. It seems bogus, maybe the Xen patch just made the problem appear ? [2]: The patch: [PATCH] lguest: fix pgdir pmd index cacluation Remove an error in index calculation which leads to removing a not existing shadow page table (leading to a Null dereference). Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar --- arch/x86/lguest/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1e613fb03e32..cccb38a59653 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -481,7 +481,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, - (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); + (__pa(pmdp)&(PAGE_SIZE-1)), 0); } /* There are a couple of legacy places where the kernel sets a PTE, but we -- cgit v1.2.2 From 92cb54a37a42a41cfb2ef7f1478bfa4395198258 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Feb 2008 14:37:52 +0100 Subject: x86: make DEBUG_PAGEALLOC and CPA more robust Use PF_MEMALLOC to prevent recursive calls in the DBEUG_PAGEALLOC case. This makes the code simpler and more robust against allocation failures. This fixes the following fallback to non-mmconfig: http://lkml.org/lkml/2008/2/20/551 http://bugzilla.kernel.org/show_bug.cgi?id=10083 Also, for DEBUG_PAGEALLOC=n reduce the pool size to one page. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 84 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 464d8fc21ce6..14e48b5a94ba 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -44,6 +44,12 @@ static inline unsigned long highmap_end_pfn(void) #endif +#ifdef CONFIG_DEBUG_PAGEALLOC +# define debug_pagealloc 1 +#else +# define debug_pagealloc 0 +#endif + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -355,45 +361,48 @@ out_unlock: static LIST_HEAD(page_pool); static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed, pool_refill; +static unsigned long pool_used, pool_failed; -static void cpa_fill_pool(void) +static void cpa_fill_pool(struct page **ret) { - struct page *p; gfp_t gfp = GFP_KERNEL; + unsigned long flags; + struct page *p; - /* Do not allocate from interrupt context */ - if (in_irq() || irqs_disabled()) - return; /* - * Check unlocked. I does not matter when we have one more - * page in the pool. The bit lock avoids recursive pool - * allocations: + * Avoid recursion (on debug-pagealloc) and also signal + * our priority to get to these pagetables: */ - if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) + if (current->flags & PF_MEMALLOC) return; + current->flags |= PF_MEMALLOC; -#ifdef CONFIG_DEBUG_PAGEALLOC /* - * We could do: - * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL; - * but this fails on !PREEMPT kernels + * Allocate atomically from atomic contexts: */ - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; -#endif + if (in_atomic() || irqs_disabled() || debug_pagealloc) + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - while (pool_pages < pool_size) { + while (pool_pages < pool_size || (ret && !*ret)) { p = alloc_pages(gfp, 0); if (!p) { pool_failed++; break; } - spin_lock_irq(&pgd_lock); + /* + * If the call site needs a page right now, provide it: + */ + if (ret && !*ret) { + *ret = p; + continue; + } + spin_lock_irqsave(&pgd_lock, flags); list_add(&p->lru, &page_pool); pool_pages++; - spin_unlock_irq(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } - clear_bit_unlock(0, &pool_refill); + + current->flags &= ~PF_MEMALLOC; } #define SHIFT_MB (20 - PAGE_SHIFT) @@ -414,11 +423,15 @@ void __init cpa_init(void) * GiB. Shift MiB to Gib and multiply the result by * POOL_PAGES_PER_GB: */ - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; + if (debug_pagealloc) { + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + } else { + pool_size = 1; + } pool_low = pool_size; - cpa_fill_pool(); + cpa_fill_pool(NULL); printk(KERN_DEBUG "CPA: page pool initialized %lu of %lu pages preallocated\n", pool_pages, pool_size); @@ -440,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address) spin_lock_irqsave(&pgd_lock, flags); if (list_empty(&page_pool)) { spin_unlock_irqrestore(&pgd_lock, flags); - return -ENOMEM; + base = NULL; + cpa_fill_pool(&base); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); + } else { + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; } - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - /* * Check for races, another CPU might have split this page * up for us already: @@ -734,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_all(cache); out: - cpa_fill_pool(); + cpa_fill_pool(NULL); + return ret; } @@ -897,7 +915,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * Try to refill the page pool here. We can do this only after * the tlb flush. */ - cpa_fill_pool(); + cpa_fill_pool(NULL); } #ifdef CONFIG_HIBERNATION -- cgit v1.2.2 From b02a7f22f39f02fdf5a1380ff700293639db4490 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 5 Feb 2008 00:48:13 +0100 Subject: x86: hpet fix docbook comment Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 429d084e014d..235fd6c77504 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -368,8 +368,8 @@ static int hpet_clocksource_register(void) return 0; } -/* - * Try to setup the HPET timer +/** + * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ int __init hpet_enable(void) { -- cgit v1.2.2 From a7ef94e6889186848573a10c5bdb8271405f44de Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Feb 2008 14:51:00 -0800 Subject: x86: do not promote TM3x00/TM5x00 to i686-class We have been promoting Transmeta TM3x00/TM5x00 chips to i686-class based on the notion that they contain all the user-space visible features of an i686-class chip. However, this is not actually true: they lack the EA-taking long NOPs (0F 1F /0). Since this is a userspace-visible incompatibility, downgrade these CPUs to the manufacturer-defined i586 level. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/transmeta.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 200fb3f9ebfb..e8b422c1c512 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -76,13 +76,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) /* All Transmeta CPUs have a constant TSC */ set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - /* If we can run i686 user-space code, call us an i686 */ -#define USER686 ((1 << X86_FEATURE_TSC)|\ - (1 << X86_FEATURE_CX8)|\ - (1 << X86_FEATURE_CMOV)) - if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686) - c->x86 = 6; - #ifdef CONFIG_SYSCTL /* randomize_va_space slows us down enormously; it probably triggers retranslation of x86->native bytecode */ -- cgit v1.2.2 From 7343b3b3a627eb30e24e921f004f659c8ebb91c5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Feb 2008 14:52:05 -0800 Subject: x86: require family >= 6 if we are using P6 NOPs The P6 family of NOPs are only available on family >= 6 or above, so enforce that in the boot code. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.cpu | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e09a6b73a1aa..86fd2a0e4597 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -377,6 +377,10 @@ config X86_OOSTORE def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR +config X86_P6_NOP + def_bool y + depends on (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) + config X86_TSC def_bool y depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 @@ -390,6 +394,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 + default "6" if X86_32 && X86_P6_NOP default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" -- cgit v1.2.2 From 959b3be64cab9160cd74532a49b89cdd918d38e9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Feb 2008 14:56:45 -0800 Subject: x86: don't use P6_NOPs if compiling with CONFIG_X86_GENERIC P6_NOPs are definitely not supported on some VIA CPUs, and possibly (unverified) on AMD K7s. It is also the only thing that prevents a 686 kernel from running on Transmeta TM3x00/5x00 (Crusoe) series. The performance benefit over generic NOPs is very small, so when building for generic consumption, avoid using them. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.cpu | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 86fd2a0e4597..6d50064db182 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -377,9 +377,18 @@ config X86_OOSTORE def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR +# +# P6_NOPs are a relatively minor optimization that require a family >= +# 6 processor, except that it is broken on certain VIA chips. +# Furthermore, AMD chips prefer a totally different sequence of NOPs +# (which work on all CPUs). As a result, disallow these if we're +# compiling X86_GENERIC but not X86_64 (these NOPs do work on all +# x86-64 capable chips); the list of processors in the right-hand clause +# are the cores that benefit from this optimization. +# config X86_P6_NOP def_bool y - depends on (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) + depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) config X86_TSC def_bool y -- cgit v1.2.2 From 829157be590af1c2555fb74c3c4db3327e3201fc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 13 Feb 2008 11:16:46 -0800 Subject: x86: handle BIOSes which terminate e820 with CF=1 and no SMAP The proper way to terminate the e820 chain is with %ebx == 0 on the last legitimate memory block. However, several BIOSes don't do that and instead return error (CF = 1) when trying to read off the end of the list. For this error return, %eax doesn't necessarily return the SMAP signature -- correctly so, since %ah should contain an error code in this case. To deal with some particularly broken BIOSes, we clear the entire e820 chain if the SMAP signature is missing in the middle, indicating a plain insane e820 implementation. However, we need to make the test for CF = 1 before the SMAP check. This fixes at least one HP laptop (nc6400) for which none of the memory-probing methods (e820, e801, 88) functioned fully according to spec. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/boot/memory.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 378353956b5d..e77d89f9e8aa 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -37,6 +37,12 @@ static int detect_memory_e820(void) "=m" (*desc) : "D" (desc), "d" (SMAP), "a" (0xe820)); + /* BIOSes which terminate the chain with CF = 1 as opposed + to %ebx = 0 don't always report the SMAP signature on + the final, failing, probe. */ + if (err) + break; + /* Some BIOSes stop returning SMAP in the middle of the search loop. We don't know exactly how the BIOS screwed up the map at that point, we might have a @@ -47,9 +53,6 @@ static int detect_memory_e820(void) break; } - if (err) - break; - count++; desc++; } while (next && count < E820MAX); -- cgit v1.2.2 From f5106d91f2bf9153d6420f9ebb8114f73f9ce66a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 18 Feb 2008 13:10:44 -0800 Subject: x86/mtrr: fix kernel-doc missing notation Fix mtrr kernel-doc warning: Warning(linux-2.6.24-git12//arch/x86/kernel/cpu/mtrr/main.c:677): No description found for parameter 'end_pfn' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b6e136f23d3d..c8fda3eb3d7e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -649,6 +649,7 @@ static __init int amd_special_default_mtrr(void) /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number * * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. This routine checks that the highest MTRR matches -- cgit v1.2.2 From 7265b6f10deba1cbe071cee646b063bda07ecd68 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 19 Feb 2008 11:02:30 +0100 Subject: x86: notsc is ignored on common configurations notsc is ignored in 32-bit kernels if CONFIG_X86_TSC is on.. which is bad, fix it. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 43517e324be8..f14cfd9d1f94 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -28,7 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz); static int __init tsc_setup(char *str) { printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); + "cannot disable TSC completely.\n"); + mark_tsc_unstable("user disabled TSC"); return 1; } #else -- cgit v1.2.2 From 3b57bc461fd5019aef4cfc77d4faf56ebe95449c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Feb 2008 18:53:17 -0800 Subject: x86: remove double-checking empty zero pages debug so far no one complained about that. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bb652f5a93fb..6dbb0035c57a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -515,14 +515,6 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - /* temporary debugging - double check it's true: */ - { - int i; - - for (i = 0; i < 1024; i++) - WARN_ON_ONCE(empty_zero_page[i]); - } - reservedpages = 0; /* this will put all low memory onto the freelists */ -- cgit v1.2.2 From 88f3aec7afd9ae3e6f6d221801996b69aad1e3a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Feb 2008 11:04:11 +0100 Subject: x86: fix spontaneous reboot with allyesconfig bzImage recently the 64-bit allyesconfig bzImage kernel started spontaneously rebooting during early bootup. after a few fun hours spent with early init debugging, it turns out that we've got this rather annoying limit on the size of the kernel image: #define KERNEL_TEXT_SIZE (40*1024*1024) which limit my vmlinux just happened to pass: text data bss dec hex filename 29703744 4222751 8646224 42572719 2899baf vmlinux 40 MB is 42572719 bytes, so my vmlinux was just 1.5% above this limit :-/ So it happily crashed right in head_64.S, which - as we all know - is the most debuggable code in the whole architecture ;-) So increase the limit to allow an up to 128MB kernel image to be mapped. (should anyone be that crazy or lazy) We have a full 4K of pagetable (level2_kernel_pgt) allocated for these mappings already, so there's no RAM overhead and the limit was rather pointless and arbitrary. Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 22 ++++++++++++++-------- arch/x86/mm/init_64.c | 5 +++-- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index eb415043a929..b037b15be7f8 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -379,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) - /* 40MB kernel mapping. The kernel code cannot be bigger than that. - When you change this change KERNEL_TEXT_SIZE in page.h too. */ - /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) - /* Module mapping starts here */ - .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 + /* + * 128 MB kernel mapping. We spend a full page on this pagetable + * anyway. + * + * The kernel code+data+bss must not be bigger than that. + * + * (NOTE: at +128MB starts the module area, see MODULES_VADDR. + * If you want to increase this then increase MODULES_VADDR + * too.) + */ + PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + KERNEL_TEXT_SIZE/PMD_SIZE) NEXT_PAGE(level2_spare_pgt) - .fill 512,8,0 + .fill 512, 8, 0 #undef PMDS #undef NEXT_PAGE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6dbb0035c57a..a02a14f0f324 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -172,8 +172,9 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) } /* - * The head.S code sets up the kernel high mapping from: - * __START_KERNEL_map to __START_KERNEL_map + KERNEL_TEXT_SIZE + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) * * phys_addr holds the negative offset to the kernel, which is added * to the compile time generated pmds. This results in invalid pmds up -- cgit v1.2.2 From d4afe414189b098d56bcd24280c018aa2ac9a990 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Feb 2008 13:39:30 +0100 Subject: x86: rename KERNEL_TEXT_SIZE => KERNEL_IMAGE_SIZE The KERNEL_TEXT_SIZE constant was mis-named, as we not only map the kernel text but data, bss and init sections as well. That name led me on the wrong path with the KERNEL_TEXT_SIZE regression, because i knew how big of _text_ my images have and i knew about the 40 MB "text" limit so i wrongly thought to be on the safe side of the 40 MB limit with my 29 MB of text, while the total image size was slightly above 40 MB. Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b037b15be7f8..a007454133a3 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -393,7 +393,7 @@ NEXT_PAGE(level2_kernel_pgt) * too.) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, - KERNEL_TEXT_SIZE/PMD_SIZE) + KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_spare_pgt) .fill 512, 8, 0 -- cgit v1.2.2 From ce28b9864b853803320c3f1d8de1b81aa4120b14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Feb 2008 23:57:30 +0100 Subject: x86: fix vsyscall wreckage based on a report from Arne Georg Gleditsch about user-space apps misbehaving after toggling /proc/sys/kernel/vsyscall64, a review of the code revealed that the "NOP patching" done there is fundamentally unsafe for a number of reasons: 1) the patching code runs without synchronizing other CPUs 2) it inserts NOPs even if there is no clock source which provides vread 3) when the clock source changes to one without vread we run in exactly the same problem as in #2 4) if nobody toggles the proc entry from 1 to 0 and to 1 again, then the syscall is not patched out as a result it is possible to break user-space via this patching. The only safe thing for now is to remove the patching. This code was broken since v2.6.21. Reported-by: Arne Georg Gleditsch Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 52 +++---------------------------------------- 1 file changed, 3 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3f8242774580..b6be812fac05 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,11 +44,6 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","cx","memory" -#define __pa_vsymbol(x) \ - ({unsigned long v; \ - extern char __vsyscall_0; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) /* * vsyscall_gtod_data contains data that is : @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(struct timezone * tz) static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; - asm volatile("vsysc2: syscall" + asm volatile("syscall" : "=a" (ret) : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) static __always_inline long time_syscall(long *t) { long secs; - asm volatile("vsysc1: syscall" + asm volatile("syscall" : "=a" (secs) : "0" (__NR_time),"D" (t) : __syscall_clobber); return secs; @@ -227,50 +222,10 @@ long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. - */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - extern u16 vsysc1, vsysc2; - u16 __iomem *map1; - u16 __iomem *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. */ - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!vsyscall_gtod_data.sysctl_enabled) { - writew(SYSCALL, map1); - writew(SYSCALL, map2); - } else { - writew(NOP2, map1); - writew(NOP2, map2); - } - iounmap(map2); -out: - iounmap(map1); - return ret; -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .mode = 0644 }, {} }; @@ -279,7 +234,6 @@ static ctl_table kernel_root_table2[] = { .child = kernel_table2 }, {} }; - #endif /* Assume __initcall executes before all user space. Hopefully kmod -- cgit v1.2.2 From 5d119b2c9a490e2d647eae134211b32a18a04c7d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 Feb 2008 12:55:57 +0100 Subject: x86: fix execve with -fstack-protect pointed out by pageexec@freemail.hu: > what happens here is that gcc treats the argument area as owned by the > callee, not the caller and is allowed to do certain tricks. for ssp it > will make a copy of the struct passed by value into the local variable > area and pass *its* address down, and it won't copy it back into the > original instance stored in the argument area. > > so once sys_execve returns, the pt_regs passed by value hasn't at all > changed and its default content will cause a nice double fault (FWIW, > this part took me the longest to debug, being down with cold didn't > help it either ;). To fix this we pass in pt_regs by pointer. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_64.S | 6 ++++-- arch/x86/kernel/process_64.c | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2ad9a1bc6a73..c20c9e7e08dd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -453,6 +453,7 @@ ENTRY(stub_execve) CFI_REGISTER rip, r11 SAVE_REST FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx call sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) @@ -1036,15 +1037,16 @@ ENDPROC(child_rip) * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack */ ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL + movq %rsp,%rcx call sys_execve movq %rax, RAX(%rsp) RESTORE_REST diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b0cc8f0136d8..43f287744f9f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -730,16 +730,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ asmlinkage long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) + char __user * __user *envp, struct pt_regs *regs) { long error; char * filename; filename = getname(name); error = PTR_ERR(filename); - if (IS_ERR(filename)) + if (IS_ERR(filename)) return error; - error = do_execve(filename, argv, envp, ®s); + error = do_execve(filename, argv, envp, regs); putname(filename); return error; } -- cgit v1.2.2 From 4147c8747eace9058c606b35e700060297edaf91 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 21 Feb 2008 15:50:14 +0100 Subject: x86: don't print a warning when MTRR are blank and running in KVM Inside a KVM virtual machine the MTRRs are usually blank. This confuses Linux and causes a warning message at boot. This patch removes that warning message when running Linux as a KVM guest. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c8fda3eb3d7e..be83336fddba 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "mtrr.h" u32 num_var_ranges = 0; @@ -689,8 +690,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); + if (!kvm_para_available()) { + printk(KERN_WARNING + "WARNING: strange, CPU MTRRs all blank?\n"); + WARN_ON(1); + } return 0; } -- cgit v1.2.2 From ed2b7e2b1d1ae201afe8fbd111632074b7b53ed4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 22 Feb 2008 21:58:37 +0200 Subject: x86: don't make swapper_pg_pmd global There doesn't seem to be any reason for swapper_pg_pmd being global. Signed-off-by: Adrian Bunk Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 25eb98540a41..fd8ca53943a8 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -606,7 +606,7 @@ ENTRY(_stext) .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(swapper_pg_pmd) +swapper_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(swapper_pg_dir) -- cgit v1.2.2 From 1650743cdc0db73478f72c57544ce79ea8f3dda6 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 22 Feb 2008 19:23:58 +0100 Subject: x86: don't save unreliable stack trace entries Currently, there is no way for print_stack_trace() to determine whether a given stack trace entry was deemed reliable or not, simply because save_stack_trace() does not record this information. (Perhaps needless to say, this makes the saved stack traces A LOT harder to read, and probably with no other benefits, since debugging features that use save_stack_trace() most likely also require frame pointers, etc.) This patch reverts to the old behaviour of only recording the reliable trace entries for saved stack traces. Signed-off-by: Vegard Nossum Acked-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 02f0f61f5b11..c28c342c162f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -25,6 +25,8 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; + if (!reliable) + return; if (trace->skip > 0) { trace->skip--; return; @@ -37,6 +39,8 @@ static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = (struct stack_trace *)data; + if (!reliable) + return; if (in_sched_functions(addr)) return; if (trace->skip > 0) { -- cgit v1.2.2 From 2b775a27c0d9fdf8078d5b31e1e27411e5bf2a91 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 22 Feb 2008 12:09:29 -0300 Subject: x86: make c_idle.work have a static address. Currently, c_idle is declared in the stack, and thus, have no static address. Peter Zijlstra points out this simple solution, in which c_idle.work is initializated separatedly. Note that the INIT_WORK macro has a static declaration of a key inside. Signed-off-by: Glauber Costa Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index d53bd6fcb428..0880f2c388a9 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -554,10 +554,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) int timeout; unsigned long start_rip; struct create_idle c_idle = { - .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), .cpu = cpu, .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; + INIT_WORK(&c_idle.work, do_fork_idle); /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && -- cgit v1.2.2 From 03994f01e8b72b3d01fd3d09d1cc7c9f421a727c Mon Sep 17 00:00:00 2001 From: Priit Laes Date: Sun, 24 Feb 2008 18:36:05 +0200 Subject: x86: fix build on non-C locales. For some locales regex range [a-zA-Z] does not work as it is supposed to. so we have to use [:alnum:] and [:xdigit:] to make it work as intended. [1] http://en.wikipedia.org/wiki/Estonian_alphabet Signed-off-by: Ingo Molnar --- arch/x86/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index f385a4b4a484..b8bd0c4aa02e 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -48,7 +48,7 @@ obj-$(VDSO64-y) += vdso-syms.lds # Match symbols in the DSO that look like VDSO*; produce a file of constants. # sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' + -e 's/^\([[:xdigit:]]*\) . \(VDSO[[:alnum:]_]*\)$$/\2 = 0x\1;/p' quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ -- cgit v1.2.2 From 12c247a6719987aad65f83158d2bb3e73c75c1f5 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Sun, 24 Feb 2008 18:27:03 +0100 Subject: x86: fix boot failure on 486 due to TSC breakage > Diffing dmesg between git7 and git8 doesn't sched any light since > git8 also removed the printouts of the x86 caps as they were being > initialised and updated. I'm currently adding those printouts back > in the hope of seeing where and when the caps get broken. That turned out to be very illuminating: --- dmesg-2.6.24-git7 2008-02-24 18:01:25.295851000 +0100 +++ dmesg-2.6.24-git8 2008-02-24 18:01:25.530358000 +0100 ... CPU: After generic identify, caps: 00000003 00000000 00000000 00000000 00000000 00000000 00000000 00000000 CPU: After all inits, caps: 00000003 00000000 00000000 00000000 00000000 00000000 00000000 00000000 +CPU: After applying cleared_cpu_caps, caps: 00000013 00000000 00000000 00000000 00000000 00000000 00000000 00000000 Notice how the TSC cap bit goes from Off to On. (The first two lines are printout loops from -git7 forward-ported to -git8, the third line is the same printout loop added just after the xor-with-cleared_cpu_caps[] loop.) Here's how the breakage occurs: 1. arch/x86/kernel/tsc_32.c:tsc_init() sees !cpu_has_tsc, so bails and calls setup_clear_cpu_cap(X86_FEATURE_TSC). 2. include/asm-x86/cpufeature.h:setup_clear_cpu_cap(bit) clears the bit in boot_cpu_data and sets it in cleared_cpu_caps 3. arch/x86/kernel/cpu/common.c:identify_cpu() XORs all caps in with cleared_cpu_caps HOWEVER, at this point c->x86_capability correctly has TSC Off, cleared_cpu_caps has TSC On, so the XOR incorrectly sets TSC to On in c->x86_capability, with disastrous results. The real bug is that clearing bits with XOR only works if the bits are known to be 1 prior to the XOR, and that's not true here. A simple fix is to convert the XOR to AND-NOT instead. The following patch does that, and allows my 486 to boot 2.6.25-rc kernels again. [ mingo@elte.hu: fixed a similar bug in setup_64.c as well. ] The breakage was introduced via commit 7d851c8d3db0. Signed-off-by: Mikael Pettersson Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/setup_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f86a3c4a2669..a38aafaefc23 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -504,7 +504,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; /* Init Machine Check Exception if available. */ mcheck_init(c); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6fd804f07821..7637dc91c79b 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -1021,7 +1021,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; #ifdef CONFIG_X86_MCE mcheck_init(c); -- cgit v1.2.2 From 3d00daf44654dc75629caf42816ac4e293658724 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 26 Feb 2008 13:00:18 -0800 Subject: x86: tls prevent_tail_call Fix a kernel bug (vmware boot problem) reported by Tomasz Grobelny, which occurs with certain .config variants and gccs. The x86 TLS cleanup in commit efd1ca52d04d2f6df337a3332cee56cd60e6d4c4 made the sys_set_thread_area and sys_get_thread_area functions ripe for tail call optimization. If the compiler chooses to use it for them, it can clobber the user trap frame because these are asmlinkage functions. Reported-by: Tomasz Grobelny Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar --- arch/x86/kernel/tls.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6dfd4e76661a..022bcaa3b42e 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -91,7 +91,9 @@ int do_set_thread_area(struct task_struct *p, int idx, asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) { - return do_set_thread_area(current, -1, u_info, 1); + int ret = do_set_thread_area(current, -1, u_info, 1); + prevent_tail_call(ret); + return ret; } @@ -139,7 +141,9 @@ int do_get_thread_area(struct task_struct *p, int idx, asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) { - return do_get_thread_area(current, -1, u_info); + int ret = do_get_thread_area(current, -1, u_info); + prevent_tail_call(ret); + return ret; } int regset_tls_active(struct task_struct *target, -- cgit v1.2.2 From d67bbacb4b557ece3b41abdcb616354ac0ce00e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Feb 2008 09:39:52 +0100 Subject: x86: restore vsyscall64 prochandler a recent fix: commit ce28b9864b853803320c3f1d8de1b81aa4120b14 Author: Thomas Gleixner Date: Wed Feb 20 23:57:30 2008 +0100 x86: fix vsyscall wreckage removed the broken /kernel/vsyscall64 handler completely. This triggers the following debug check: sysctl table check failed: /kernel/vsyscall64 No proc_handler Restore the sane part of the proc handler. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b6be812fac05..edff4c985485 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -222,10 +222,19 @@ long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL + +static int +vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); +} + static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644 }, + .mode = 0644, + .proc_handler = vsyscall_sysctl_change }, {} }; -- cgit v1.2.2 From f2dbe03dccc95f41429d60e4221b02fc0f112cc4 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 27 Feb 2008 11:42:15 -0800 Subject: x86 vdso: fix build locale dependency Priit Laes discovered that the sed command processing nm output was sensitive to locale settings. This was addressed in commit 03994f01e8b72b3d01fd3d09d1cc7c9f421a727c by using [:alnum:] in place of [a-zA-Z0-9]. But that solution too is locale-dependent and may not always match the identifiers it needs to. The better fix is just to run sed et al with a fixed locale setting in all builds. Signed-off-by: Roland McGrath CC: Priit Laes Signed-off-by: Ingo Molnar --- arch/x86/vdso/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index b8bd0c4aa02e..0a8f4742ef51 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -48,9 +48,11 @@ obj-$(VDSO64-y) += vdso-syms.lds # Match symbols in the DSO that look like VDSO*; produce a file of constants. # sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([[:xdigit:]]*\) . \(VDSO[[:alnum:]_]*\)$$/\2 = 0x\1;/p' + -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' quiet_cmd_vdsosym = VDSOSYM $@ - cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ +define cmd_vdsosym + $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ +endef $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE $(call if_changed,vdsosym) -- cgit v1.2.2 From b16bf712f491808a8c926dd481c696fe7d73ee5a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Feb 2008 14:02:08 +0100 Subject: x86: fix leak un ioremap_page_range() failure Jan Beulich noticed it during code review that if a driver's ioremap() fails (say due to -ENOMEM) then we might leak the struct vm_area. Free it properly. Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 882328efc3db..ac3c959e271d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -162,7 +162,7 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { - remove_vm_area((void *)(vaddr & PAGE_MASK)); + free_vm_area(area); return NULL; } -- cgit v1.2.2 From 757265b8c57bb8fd91785d3d1a87fb483c86c9c2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Feb 2008 20:19:06 +0100 Subject: x86: delay the export removal of init_mm delay the removal of this symbol export by one more kernel release, giving external modules such as VirtualBox a chance to stop using it. Signed-off-by: Ingo Molnar --- arch/x86/kernel/init_task.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 5b3ce7934363..3d01e47777db 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,6 +15,7 @@ static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ /* * Initial thread structure. -- cgit v1.2.2 From 8be8f54bae3453588011cad06363813a5293af53 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Feb 2008 20:43:21 +0100 Subject: x86: CPA: avoid split of alias mappings avoid over-eager large page splitup. When the target area needs to be split or is split already (ioremap) then the current code enforces the split of large mappings in the alias regions even if we could avoid it. Use a separate variable processed in the cpa_data structure to carry the number of pages which have been processed instead of reusing the numpages variable. This keeps numpages intact and gives the alias code a chance to keep large mappings intact. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 14e48b5a94ba..7049294fb469 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -26,6 +26,7 @@ struct cpa_data { pgprot_t mask_set; pgprot_t mask_clr; int numpages; + int processed; int flushtlb; unsigned long pfn; }; @@ -290,8 +291,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ nextpage_addr = (address + psize) & pmask; numpages = (nextpage_addr - address) >> PAGE_SHIFT; - if (numpages < cpa->numpages) - cpa->numpages = numpages; + if (numpages < cpa->processed) + cpa->processed = numpages; /* * We are safe now. Check whether the new pgprot is the same: @@ -318,7 +319,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ addr = address + PAGE_SIZE; pfn++; - for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { + for (i = 1; i < cpa->processed; i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(new_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) @@ -342,7 +343,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * that we limited the number of possible pages already to * the number of pages in the large page. */ - if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + if (address == (nextpage_addr - psize) && cpa->processed == numpages) { /* * The address is aligned and the number of pages * covers the full page. @@ -572,7 +573,7 @@ repeat: set_pte_atomic(kpte, new_pte); cpa->flushtlb = 1; } - cpa->numpages = 1; + cpa->processed = 1; return 0; } @@ -583,7 +584,7 @@ repeat: do_split = try_preserve_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, - * return. cp->numpages and cpa->tlbflush have been updated in + * return. cp->processed and cpa->tlbflush have been updated in * try_large_page: */ if (do_split <= 0) @@ -662,7 +663,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * Store the remaining nr of pages for the large page * preservation check. */ - cpa->numpages = numpages; + cpa->numpages = cpa->processed = numpages; ret = __change_page_attr(cpa, checkalias); if (ret) @@ -679,9 +680,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. Either a large page has been * preserved or a single page update happened. */ - BUG_ON(cpa->numpages > numpages); - numpages -= cpa->numpages; - cpa->vaddr += cpa->numpages * PAGE_SIZE; + BUG_ON(cpa->processed > numpages); + numpages -= cpa->processed; + cpa->vaddr += cpa->processed * PAGE_SIZE; } return 0; } -- cgit v1.2.2 From b4ef95de00be4c2c30feccf607a45093c8c118b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 Feb 2008 09:40:27 +0100 Subject: x86: disable BTS ptrace extensions for now revert the BTS ptrace extension for now. based on general objections from Roland McGrath: http://lkml.org/lkml/2008/2/21/323 we'll let the BTS functionality cook some more and re-enable it in v2.6.26. We'll leave the dead code around to help the development of this code. (X86_BTS is not defined at the moment) Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 2 ++ arch/x86/kernel/process_64.c | 2 ++ arch/x86/kernel/ptrace.c | 12 ++++++++++++ 3 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a7d50a547dc2..be3c7a299f02 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -603,11 +603,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, } #endif +#ifdef X86_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); +#endif if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 43f287744f9f..3baf9b9f4c87 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -604,11 +604,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +#ifdef X86_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); +#endif } /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d862e396b099..f41fdc98efb1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -544,6 +544,8 @@ static int ptrace_set_debugreg(struct task_struct *child, return 0; } +#ifdef X86_BTS + static int ptrace_bts_get_size(struct task_struct *child) { if (!child->thread.ds_area_msr) @@ -826,6 +828,7 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk, ptrace_bts_write_record(tsk, &rec); } +#endif /* X86_BTS */ /* * Called by kernel/ptrace.c when detaching.. @@ -839,7 +842,9 @@ void ptrace_disable(struct task_struct *child) clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif if (child->thread.ds_area_msr) { +#ifdef X86_BTS ptrace_bts_realloc(child, 0, 0); +#endif child->thread.debugctlmsr &= ~ds_debugctl_mask(); if (!child->thread.debugctlmsr) clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); @@ -961,6 +966,10 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; #endif + /* + * These bits need more cooking - not enabled yet: + */ +#ifdef X86_BTS case PTRACE_BTS_CONFIG: ret = ptrace_bts_config (child, data, (struct ptrace_bts_config __user *)addr); @@ -988,6 +997,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) ret = ptrace_bts_drain (child, data, (struct bts_struct __user *) addr); break; +#endif default: ret = ptrace_request(child, request, addr, data); @@ -1226,12 +1236,14 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) case PTRACE_SETOPTIONS: case PTRACE_SET_THREAD_AREA: case PTRACE_GET_THREAD_AREA: +#ifdef X86_BTS case PTRACE_BTS_CONFIG: case PTRACE_BTS_STATUS: case PTRACE_BTS_SIZE: case PTRACE_BTS_GET: case PTRACE_BTS_CLEAR: case PTRACE_BTS_DRAIN: +#endif return sys_ptrace(request, pid, addr, data); default: -- cgit v1.2.2 From d40e705903397445c6861a0a56c23e5b2e8f9b9a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 29 Feb 2008 18:55:43 +0100 Subject: xen: mask out SEP from CPUID Fix 32-on-64 pvops kernel: we don't want userspace using syscall/sysenter, even if the hypervisor supports it, so mask it out from CPUID. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 49e5358f481a..8b9ee27805fd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -153,6 +153,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ + (1 << X86_FEATURE_SEP) | /* disable SEP */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" -- cgit v1.2.2 From 334df50a866ff7e234c9566960997ca5b9d0a382 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Jan 2008 13:09:33 +0100 Subject: KVM: SVM: Fix lazy FPU switching If the guest writes to cr0 and leaves the TS flag at 0 while vcpu->fpu_active is also 0, the TS flag in the guest's cr0 gets lost. This leads to corrupt FPU state an causes Windows Vista 64bit to crash very soon after boot. This patch fixes this bug. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index de755cb1431d..511628916c4b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -792,6 +792,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + if (!vcpu->fpu_active) + cr0 |= X86_CR0_TS; svm->vmcb->save.cr0 = cr0; } -- cgit v1.2.2 From 6b390b6392309b98fd116b57c2926c44975cde26 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 29 Jan 2008 13:01:27 +0100 Subject: KVM: SVM: set NM intercept when enabling CR0.TS in the guest Explicitly enable the NM intercept in svm_set_cr0 if we enable TS in the guest copy of CR0 for lazy FPU switching. This fixes guest SMP with Linux under SVM. Without that patch Linux deadlocks or panics right after trying to boot the other CPUs. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 511628916c4b..d71daabbb51b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -792,8 +792,10 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; cr0 &= ~(X86_CR0_CD | X86_CR0_NW); - if (!vcpu->fpu_active) + if (!vcpu->fpu_active) { + svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); cr0 |= X86_CR0_TS; + } svm->vmcb->save.cr0 = cr0; } -- cgit v1.2.2 From d730616384211436cfc84e6c2c1aa45351706a96 Mon Sep 17 00:00:00 2001 From: Paul Knowles Date: Wed, 6 Feb 2008 11:02:35 +0000 Subject: KVM: Fix kvm_arch_vcpu_ioctl_set_sregs so that set_cr0 works properly Whilst working on getting a VM to initialize in to IA32e mode I found this issue. set_cr0 relies on comparing the old cr0 to the new one to work correctly. Move the assignment below so the compare can work. Signed-off-by: Paul Knowles Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf5308148689..ec60409299a3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2861,8 +2861,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->decache_cr4_guest_bits(vcpu); mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; - vcpu->arch.cr0 = sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); -- cgit v1.2.2 From 674eea0fc4d1d693250b5d3ddad42ca931c87dfd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 11 Feb 2008 18:37:23 +0200 Subject: KVM: Make the supported cpuid list a host property rather than a vm property One of the use cases for the supported cpuid list is to create a "greatest common denominator" of cpu capabilities in a server farm. As such, it is useful to be able to get the list without creating a virtual machine first. Since the code does not depend on the vm in any way, all that is needed is to move it to the device ioctl handler. The capability identifier is also changed so that binaries made against -rc1 will fail gracefully. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec60409299a3..a7069ec2267c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -46,6 +46,9 @@ #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); + struct kvm_x86_ops *kvm_x86_ops; struct kvm_stats_debugfs_item debugfs_entries[] = { @@ -727,6 +730,24 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } + case KVM_GET_SUPPORTED_CPUID: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, + cpuid_arg->entries); + if (r) + goto out; + + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + goto out; + r = 0; + break; + } default: r = -EINVAL; } @@ -974,8 +995,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, put_cpu(); } -static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, - struct kvm_cpuid2 *cpuid, +static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { struct kvm_cpuid_entry2 *cpuid_entries; @@ -1487,24 +1507,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } - case KVM_GET_SUPPORTED_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } default: ; } -- cgit v1.2.2 From c7ac679c160db864810920df61a6ed14275011aa Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 11 Feb 2008 20:28:27 +0100 Subject: KVM: emulate access to MSR_IA32_MCG_CTL Injecting an GP when accessing this MSR lets Windows crash when running some stress test tools in KVM. So this patch emulates access to this MSR. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7069ec2267c..338764fa5391 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -487,6 +487,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", __FUNCTION__, data); break; + case MSR_IA32_MCG_CTL: + pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", + __FUNCTION__, data); + break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: case 0x200 ... 0x2ff: /* MTRRs */ @@ -529,6 +533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: case MSR_IA32_MC0_MISC: case MSR_IA32_MC0_MISC+4: case MSR_IA32_MC0_MISC+8: -- cgit v1.2.2 From 9b5cf48b06a52c04b85c88642c3b620db8e1d592 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 3 Mar 2008 01:17:37 +0100 Subject: x86: revert "x86: CPA: avoid split of alias mappings" Revert: commit 8be8f54bae3453588011cad06363813a5293af53 Author: Thomas Gleixner Date: Sat Feb 23 20:43:21 2008 +0100 x86: CPA: avoid split of alias mappings because it clearly mishandles the case when __change_page_attr(), called from __change_page_attr_set_clr(), changes cpa->processed to 1 and cpa_process_alias(cpa) is executed right after that. This crashes my x86-64 test box early in the boot process (ref. http://bugzilla.kernel.org/show_bug.cgi?id=10140#c4). Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7049294fb469..14e48b5a94ba 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -26,7 +26,6 @@ struct cpa_data { pgprot_t mask_set; pgprot_t mask_clr; int numpages; - int processed; int flushtlb; unsigned long pfn; }; @@ -291,8 +290,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ nextpage_addr = (address + psize) & pmask; numpages = (nextpage_addr - address) >> PAGE_SHIFT; - if (numpages < cpa->processed) - cpa->processed = numpages; + if (numpages < cpa->numpages) + cpa->numpages = numpages; /* * We are safe now. Check whether the new pgprot is the same: @@ -319,7 +318,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ addr = address + PAGE_SIZE; pfn++; - for (i = 1; i < cpa->processed; i++, addr += PAGE_SIZE, pfn++) { + for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(new_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) @@ -343,7 +342,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * that we limited the number of possible pages already to * the number of pages in the large page. */ - if (address == (nextpage_addr - psize) && cpa->processed == numpages) { + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { /* * The address is aligned and the number of pages * covers the full page. @@ -573,7 +572,7 @@ repeat: set_pte_atomic(kpte, new_pte); cpa->flushtlb = 1; } - cpa->processed = 1; + cpa->numpages = 1; return 0; } @@ -584,7 +583,7 @@ repeat: do_split = try_preserve_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, - * return. cp->processed and cpa->tlbflush have been updated in + * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ if (do_split <= 0) @@ -663,7 +662,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * Store the remaining nr of pages for the large page * preservation check. */ - cpa->numpages = cpa->processed = numpages; + cpa->numpages = numpages; ret = __change_page_attr(cpa, checkalias); if (ret) @@ -680,9 +679,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. Either a large page has been * preserved or a single page update happened. */ - BUG_ON(cpa->processed > numpages); - numpages -= cpa->processed; - cpa->vaddr += cpa->processed * PAGE_SIZE; + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; + cpa->vaddr += cpa->numpages * PAGE_SIZE; } return 0; } -- cgit v1.2.2 From 72dc67a69690288538142df73a7e3ac66fea68dc Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Sun, 10 Feb 2008 18:04:15 +0200 Subject: KVM: remove the usage of the mmap_sem for the protection of the memory slots. This patch replaces the mmap_sem lock for the memory slots with a new kvm private lock, it is needed beacuse untill now there were cases where kvm accesses user memory while holding the mmap semaphore. Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 24 ++++++++++++++--- arch/x86/kvm/paging_tmpl.h | 13 +++++++--- arch/x86/kvm/vmx.c | 7 +++-- arch/x86/kvm/x86.c | 65 ++++++++++++++++++++++++++-------------------- 4 files changed, 71 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8efdcdbebb03..26037106ad19 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -876,11 +876,18 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { + struct page *page; + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); if (gpa == UNMAPPED_GVA) return NULL; - return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + up_read(¤t->mm->mmap_sem); + + return page; } static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, @@ -1020,15 +1027,18 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) struct page *page; + down_read(&vcpu->kvm->slots_lock); + down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gfn); + up_read(¤t->mm->mmap_sem); spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __nonpaging_map(vcpu, v, write, gfn, page); spin_unlock(&vcpu->kvm->mmu_lock); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return r; } @@ -1362,6 +1372,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn; int r; u64 gpte = 0; + struct page *page; if (bytes != 4 && bytes != 8) return; @@ -1389,6 +1400,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!is_present_pte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, gfn); + up_read(¤t->mm->mmap_sem); + vcpu->arch.update_pte.gfn = gfn; vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); } @@ -1496,9 +1512,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 03ba8608fe0f..2009c6e9dc4d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -91,7 +91,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, pt_element_t *table; struct page *page; + down_read(¤t->mm->mmap_sem); page = gfn_to_page(kvm, table_gfn); + up_read(¤t->mm->mmap_sem); + table = kmap_atomic(page, KM_USER0); ret = CMPXCHG(&table[index], orig_pte, new_pte); @@ -378,7 +381,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (r) return r; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); /* * Look up the shadow pte for the faulting address. */ @@ -392,11 +395,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pgprintk("%s: guest page fault\n", __FUNCTION__); inject_page_fault(vcpu, addr, walker.error_code); vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return 0; } + down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, walker.gfn); + up_read(¤t->mm->mmap_sem); spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); @@ -413,14 +418,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, */ if (shadow_pte && is_io_pte(*shadow_pte)) { spin_unlock(&vcpu->kvm->mmu_lock); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return 1; } ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); spin_unlock(&vcpu->kvm->mmu_lock); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return write_pt; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad36447e696e..86f5bf121838 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1477,7 +1477,7 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(¤t->mm->mmap_sem); + down_write(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -1487,9 +1487,12 @@ static int alloc_apic_access_page(struct kvm *kvm) r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); if (r) goto out; + + down_read(¤t->mm->mmap_sem); kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); + up_read(¤t->mm->mmap_sem); out: - up_write(¤t->mm->mmap_sem); + up_write(&kvm->slots_lock); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 338764fa5391..6b01552bd1f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -184,7 +184,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) int ret; u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, offset * sizeof(u64), sizeof(pdpte)); if (ret < 0) { @@ -201,7 +201,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); out: - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return ret; } @@ -215,13 +215,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) if (is_long_mode(vcpu) || !is_pae(vcpu)) return false; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; out: - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return changed; } @@ -359,7 +359,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ } - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would @@ -375,7 +375,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.cr3 = cr3; vcpu->arch.mmu.new_cr3(vcpu); } - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); } EXPORT_SYMBOL_GPL(set_cr3); @@ -1232,12 +1232,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(¤t->mm->mmap_sem); + down_write(&kvm->slots_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; - up_write(¤t->mm->mmap_sem); + up_write(&kvm->slots_lock); return 0; } @@ -1286,7 +1286,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - down_write(¤t->mm->mmap_sem); + down_write(&kvm->slots_lock); p = &kvm->arch.aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; @@ -1300,7 +1300,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, kvm_mmu_zap_all(kvm); - up_write(¤t->mm->mmap_sem); + up_write(&kvm->slots_lock); return 0; @@ -1376,7 +1376,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int is_dirty = 0; - down_write(¤t->mm->mmap_sem); + down_write(&kvm->slots_lock); r = kvm_get_dirty_log(kvm, log, &is_dirty); if (r) @@ -1392,7 +1392,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, } r = 0; out: - up_write(¤t->mm->mmap_sem); + up_write(&kvm->slots_lock); return r; } @@ -1570,7 +1570,7 @@ int emulator_read_std(unsigned long addr, void *data = val; int r = X86EMUL_CONTINUE; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); while (bytes) { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr & (PAGE_SIZE-1); @@ -1592,7 +1592,7 @@ int emulator_read_std(unsigned long addr, addr += tocopy; } out: - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return r; } EXPORT_SYMBOL_GPL(emulator_read_std); @@ -1611,9 +1611,9 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -1651,14 +1651,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { int ret; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); if (ret < 0) { - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return 0; } kvm_mmu_pte_write(vcpu, gpa, val, bytes); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); return 1; } @@ -1670,9 +1670,9 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_io_device *mmio_dev; gpa_t gpa; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); if (gpa == UNMAPPED_GVA) { kvm_inject_page_fault(vcpu, addr, 2); @@ -1749,7 +1749,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, char *kaddr; u64 val; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); if (gpa == UNMAPPED_GVA || @@ -1760,13 +1760,17 @@ static int emulator_cmpxchg_emulated(unsigned long addr, goto emul_write; val = *(u64 *)new; + + down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + up_read(¤t->mm->mmap_sem); + kaddr = kmap_atomic(page, KM_USER0); set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); kunmap_atomic(kaddr, KM_USER0); kvm_release_page_dirty(page); emul_write: - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); } #endif @@ -2159,10 +2163,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, kvm_x86_ops->skip_emulated_instruction(vcpu); for (i = 0; i < nr_pages; ++i) { - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); page = gva_to_page(vcpu, address + i * PAGE_SIZE); vcpu->arch.pio.guest_pages[i] = page; - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); if (!page) { kvm_inject_gp(vcpu, 0); free_pio_guest_pages(vcpu); @@ -2485,8 +2489,9 @@ static void vapic_enter(struct kvm_vcpu *vcpu) down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - vcpu->arch.apic->vapic_page = page; up_read(¤t->mm->mmap_sem); + + vcpu->arch.apic->vapic_page = page; } static void vapic_exit(struct kvm_vcpu *vcpu) @@ -2959,9 +2964,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; vcpu_load(vcpu); - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -3234,11 +3239,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm, */ if (!user_alloc) { if (npages && !old.rmap) { + down_write(¤t->mm->mmap_sem); memslot->userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); + up_write(¤t->mm->mmap_sem); if (IS_ERR((void *)memslot->userspace_addr)) return PTR_ERR((void *)memslot->userspace_addr); @@ -3246,8 +3253,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (!old.user_alloc && old.rmap) { int ret; + down_write(¤t->mm->mmap_sem); ret = do_munmap(current->mm, old.userspace_addr, old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); if (ret < 0) printk(KERN_WARNING "kvm_vm_ioctl_set_memory_region: " -- cgit v1.2.2 From a2938c807024ba30191e3bd593430c0659d75717 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Feb 2008 16:30:28 +0100 Subject: KVM: SVM: fix Windows XP 64 bit installation crash While installing Windows XP 64 bit wants to access the DEBUGCTL and the last branch record (LBR) MSRs. Don't allowing this in KVM causes the installation to crash. This patch allow the access to these MSRs and fixes the issue. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d71daabbb51b..1a582f1090e8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1100,6 +1100,24 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_SYSENTER_ESP: *data = svm->vmcb->save.sysenter_esp; break; + /* Nobody will change the following 5 values in the VMCB so + we can safely return them on rdmsr. They will always be 0 + until LBRV is implemented. */ + case MSR_IA32_DEBUGCTLMSR: + *data = svm->vmcb->save.dbgctl; + break; + case MSR_IA32_LASTBRANCHFROMIP: + *data = svm->vmcb->save.br_from; + break; + case MSR_IA32_LASTBRANCHTOIP: + *data = svm->vmcb->save.br_to; + break; + case MSR_IA32_LASTINTFROMIP: + *data = svm->vmcb->save.last_excp_from; + break; + case MSR_IA32_LASTINTTOIP: + *data = svm->vmcb->save.last_excp_to; + break; default: return kvm_get_msr_common(vcpu, ecx, data); } @@ -1160,6 +1178,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_IA32_SYSENTER_ESP: svm->vmcb->save.sysenter_esp = data; break; + case MSR_IA32_DEBUGCTLMSR: + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __FUNCTION__, data); + break; case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: -- cgit v1.2.2 From 5e4a0b3c1b899bb0ba28bde6edf95c5ddeb48b5c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 14 Feb 2008 21:21:43 -0200 Subject: KVM: move alloc_apic_access_page() outside of non-preemptable region alloc_apic_access_page() can sleep, while vmx_vcpu_setup is called inside a non preemptable region. Move it after put_cpu(). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 86f5bf121838..61c2a3a8d20a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1605,9 +1605,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) - return -ENOMEM; return 0; } @@ -2537,6 +2534,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; + if (vm_need_virtualize_apic_accesses(kvm)) + if (alloc_apic_access_page(kvm) != 0) + goto free_vmcs; return &vmx->vcpu; -- cgit v1.2.2 From 24993d53495d1f9b844f8eb3ebd1b9efd3521617 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 14 Feb 2008 21:25:39 -0200 Subject: KVM: make MMU_DEBUG compile again the cr3 variable is now inside the vcpu->arch structure. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 26037106ad19..194ece6974e6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1182,7 +1182,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) static void paging_new_cr3(struct kvm_vcpu *vcpu) { - pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); + pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3); mmu_free_roots(vcpu); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2009c6e9dc4d..5588fa594b61 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -143,7 +143,7 @@ walk: } #endif ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); + (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); pt_access = ACC_ALL; -- cgit v1.2.2 From 0b975a3c2d53829fa978e18fabae7d99031f588f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 Feb 2008 14:37:50 +0200 Subject: KVM: Avoid infinite-frequency local apic timer If the local apic initial count is zero, don't start a an hrtimer with infinite frequency, locking up the host. Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2cbee9479ce4..68a6b1511934 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -647,6 +647,10 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->timer.period = apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->timer.divide_count; atomic_set(&apic->timer.pending, 0); + + if (!apic->timer.period) + return; + hrtimer_start(&apic->timer.dev, ktime_add_ns(now, apic->timer.period), HRTIMER_MODE_ABS); -- cgit v1.2.2 From f7d9c7b7b902f9f532738d47593d9679b0b182d9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 26 Feb 2008 22:12:10 +0200 Subject: KVM: MMU: Fix race when instantiating a shadow pte For improved concurrency, the guest walk is performed concurrently with other vcpus. This means that we need to revalidate the guest ptes once we have write-protected the guest page tables, at which point they can no longer be modified. The current code attempts to avoid this check if the shadow page table is not new, on the assumption that if it has existed before, the guest could not have modified the pte without the shadow lock. However the assumption is incorrect, as the racing vcpu could have modified the pte, then instantiated the shadow page, before our vcpu regains control: vcpu0 vcpu1 fault walk pte modify pte fault in same pagetable instantiate shadow page lookup shadow page conclude it is old instantiate spte based on stale guest pte We could do something clever with generation counters, but a test run by Marcelo suggests this is unnecessary and we can just do the revalidation unconditionally. The pte will be in the processor cache and the check can be quite fast. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 ++++-------- arch/x86/kvm/paging_tmpl.h | 5 ++--- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 194ece6974e6..d8172aabc660 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -681,8 +681,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned level, int metaphysical, unsigned access, - u64 *parent_pte, - bool *new_page) + u64 *parent_pte) { union kvm_mmu_page_role role; unsigned index; @@ -722,8 +721,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, vcpu->arch.mmu.prefetch_page(vcpu, sp); if (!metaphysical) rmap_write_protect(vcpu->kvm, gfn); - if (new_page) - *new_page = 1; return sp; } @@ -1006,8 +1003,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, >> PAGE_SHIFT; new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, v, level - 1, - 1, ACC_ALL, &table[index], - NULL); + 1, ACC_ALL, &table[index]); if (!new_table) { pgprintk("nonpaging_map: ENOMEM\n"); kvm_release_page_clean(page); @@ -1100,7 +1096,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); + PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; @@ -1121,7 +1117,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), - ACC_ALL, NULL, NULL); + ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5588fa594b61..ecc0856268c4 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -300,7 +300,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, u64 shadow_pte; int metaphysical; gfn_t table_gfn; - bool new_page = 0; shadow_ent = ((u64 *)__va(shadow_addr)) + index; if (level == PT_PAGE_TABLE_LEVEL) @@ -322,8 +321,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, metaphysical, access, - shadow_ent, &new_page); - if (new_page && !metaphysical) { + shadow_ent); + if (!metaphysical) { int r; pt_element_t curr_pte; r = kvm_read_guest_atomic(vcpu->kvm, -- cgit v1.2.2 From 33f9c505ed5c83bd8a07877e5b4628308f4cc099 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Feb 2008 16:06:57 +0200 Subject: KVM: VMX: Avoid rearranging switched guest msrs while they are loaded KVM tries to run as much as possible with the guest msrs loaded instead of host msrs, since switching msrs is very expensive. It also tries to minimize the number of msrs switched according to the guest mode; for example, MSR_LSTAR is needed only by long mode guests. This optimization is done by setup_msrs(). However, we must not change which msrs are switched while we are running with guest msr state: - switch to guest msr state - call setup_msrs(), removing some msrs from the list - switch to host msr state, leaving a few guest msrs loaded An easy way to trigger this is to kexec an x86_64 linux guest. Early during setup, the guest will switch EFER to not include SCE. KVM will stop saving MSR_LSTAR, and on the next msr switch it will leave the guest LSTAR loaded. The next host syscall will end up in a random location in the kernel. Fix by reloading the host msrs before changing the msr list. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 61c2a3a8d20a..94ea724638fd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -638,6 +638,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs; + vmx_load_host_state(vmx); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { -- cgit v1.2.2 From 1a4e3f89c6b2cbe0b26c08ec63a8c34156eaae04 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Feb 2008 09:20:08 -0800 Subject: x86: disable KVM for Voyager and friends Most classic Pentiums don't have hardware virtualization extension, and building kvm with Voyager, Visual Workstation, or NUMAQ generates spurious failures. Signed-off-by: Avi Kivity Signed-off-by: Randy Dunlap --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4a88cf7695b4..53800b80a204 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,7 +21,7 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES - select HAVE_KVM + select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) config GENERIC_LOCKBREAK -- cgit v1.2.2 From 18a8622101154277df97e24097ed17aace84fa3a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 3 Mar 2008 13:01:08 -0800 Subject: x86, i387: fix ptrace leakage using init_fpu() This bug got introduced by the recent i387 merge: commit 4421011120b2304e5c248ae4165a2704588aedf1 Author: Roland McGrath Date: Wed Jan 30 13:31:50 2008 +0100 x86: x86 i387 user_regset Current usage of unlazy_fpu() in ptrace specific routines is wrong. unlazy_fpu() will not init fpu if the task never used math. So the ptrace calls can expose the parent tasks FPU data in some cases. Replace it with the init_fpu() which will init the math state, if the task never used math before. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner --- arch/x86/kernel/i387.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 763dfc407232..60fe80157569 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -132,7 +132,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - unlazy_fpu(target); + init_fpu(target); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.i387.fxsave, 0, -1); @@ -147,7 +147,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - unlazy_fpu(target); + init_fpu(target); set_stopped_child_used_math(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -307,7 +307,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, if (!HAVE_HWFP) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - unlazy_fpu(target); + init_fpu(target); if (!cpu_has_fxsr) return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -332,7 +332,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - unlazy_fpu(target); + init_fpu(target); set_stopped_child_used_math(target); if (!cpu_has_fxsr) -- cgit v1.2.2 From 7c9e92b6cdc9937eee53600e5d49a25e421463dd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Feb 2008 15:35:54 -0800 Subject: x86: not set node to cpu_to_node if the node is not online resolve boot problem reported by Mel Gorman: http://lkml.org/lkml/2008/2/13/404 init_cpu_to_node will use cpu->apic (from MADT or mptable) and apic->node(from SRAT or AMD config space with k8_bus_64.c) to have cpu->node mapping, and later identify_cpu will overwrite them again...(with nearby_node...) this patch checks if the node is online, otherwise it will not update cpu_node map. so keep cpu_node map to online node before identify_cpu..., to prevent possible error. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner --- arch/x86/mm/numa_64.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 59898fb0a4aa..8ccfee10f5b5 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -622,13 +622,17 @@ void __init init_cpu_to_node(void) int i; for (i = 0; i < NR_CPUS; i++) { + int node; u16 apicid = x86_cpu_to_apicid_init[i]; if (apicid == BAD_APICID) continue; - if (apicid_to_node[apicid] == NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE) continue; - numa_set_node(i, apicid_to_node[apicid]); + if (!node_online(node)) + continue; + numa_set_node(i, node); } } -- cgit v1.2.2 From 87d034f3139b5f0d93df2ba58f37d6f2c2c7eeb6 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 28 Feb 2008 23:16:49 +0000 Subject: x86/xen: fix DomU boot problem Construct Xen guest e820 map with a hole between 640K-1M. It's pure luck that Xen kernels have gotten away with it in the past. The patch below seems like the right thing to do. It certainly boots in a domU without the DMI problem (without any of the other related patches such as Alexander's). Signed-off-by: Ian Campbell Cc: H. Peter Anvin Cc: Jeremy Fitzhardinge Tested-by: Mark McLoughlin Acked-by: Mark McLoughlin Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner --- arch/x86/xen/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 3bad4773a2f3..2341492bf7a0 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -38,7 +38,8 @@ char * __init xen_memory_setup(void) unsigned long max_pfn = xen_start_info->nr_pages; e820.nr_map = 0; - add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); + add_memory_region(0, LOWMEMSIZE(), E820_RAM); + add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); return "Xen"; } -- cgit v1.2.2 From fcab59a3186640ce085e89ee6dfc03cacfb6c7c9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 19:33:24 +0000 Subject: x86: a P4 is a P6 not an i486 P4 has been coming out as CPU_FAMILY=4 instead of 6: fix MPENTIUM4 typo. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6d50064db182..9304bfba7d45 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -388,7 +388,7 @@ config X86_OOSTORE # config X86_P6_NOP def_bool y - depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) + depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4) config X86_TSC def_bool y -- cgit v1.2.2 From 9edddaa200df18e08fe0cf21036e8ae467b1363c Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Tue, 4 Mar 2008 14:28:37 -0800 Subject: Kprobes: indicate kretprobe support in Kconfig Add CONFIG_HAVE_KRETPROBES to the arch//Kconfig file for relevant architectures with kprobes support. This facilitates easy handling of in-kernel modules (like samples/kprobes/kretprobe_example.c) that depend on kretprobes being present in the kernel. Thanks to Sam Ravnborg for helping make the patch more lean. Per Mathieu's suggestion, added CONFIG_KRETPROBES and fixed up dependencies. Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Mathieu Desnoyers Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 53800b80a204..f41c9538ca30 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,6 +21,7 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_KRETPROBES select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) -- cgit v1.2.2 From 0e5aa8d6218f9914b23e492debf653bda5598af3 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 15 Feb 2008 18:11:14 -0500 Subject: [CPUFREQ] Remove debugging message from e_powersaver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't need to printk a message every time we transition. Leave the code there, but ifdef'd out, as it's useful when adding support for new processors. Reported-by: Petr TitÄ›ra Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c index 39f8cb18296c..c2f930d86640 100644 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c @@ -55,7 +55,6 @@ static int eps_set_state(struct eps_cpu_data *centaur, { struct cpufreq_freqs freqs; u32 lo, hi; - u8 current_multiplier, current_voltage; int err = 0; int i; @@ -95,6 +94,10 @@ postchange: rdmsr(MSR_IA32_PERF_STATUS, lo, hi); freqs.new = centaur->fsb * ((lo >> 8) & 0xff); +#ifdef DEBUG + { + u8 current_multiplier, current_voltage; + /* Print voltage and multiplier */ rdmsr(MSR_IA32_PERF_STATUS, lo, hi); current_voltage = lo & 0xff; @@ -103,7 +106,8 @@ postchange: current_multiplier = (lo >> 8) & 0xff; printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); - + } +#endif cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); return err; } -- cgit v1.2.2 From e40cd10ccff3d9fbffd57b93780bee4b7b9bff51 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Wed, 5 Mar 2008 19:14:24 +0100 Subject: x86: clear DF before calling signal handler The Linux kernel currently does not clear the direction flag before calling a signal handler, whereas the x86/x86-64 ABI requires that. Linux had this behavior/bug forever, but this becomes a real problem with gcc version 4.3, which assumes that the direction flag is correctly cleared at the entry of a function. This patches changes the setup_frame() functions to clear the direction before entering the signal handler. Signed-off-by: Aurelien Jarno Signed-off-by: Ingo Molnar Acked-by: H. Peter Anvin --- arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/kernel/signal_32.c | 4 ++-- arch/x86/kernel/signal_64.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 1c0503bdfb1a..5e7771a3ba2f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -500,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->flags &= ~X86_EFLAGS_TF; + regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF); if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); @@ -600,7 +600,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->flags &= ~X86_EFLAGS_TF; + regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF); if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index caee1f002fed..0157a6f0f41f 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -407,7 +407,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, * The tracer may want to single-step inside the * handler too. */ - regs->flags &= ~TF_MASK; + regs->flags &= ~(TF_MASK | X86_EFLAGS_DF); if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); @@ -500,7 +500,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * The tracer may want to single-step inside the * handler too. */ - regs->flags &= ~TF_MASK; + regs->flags &= ~(TF_MASK | X86_EFLAGS_DF); if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 7347bb14e306..56b72fb67f9b 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -295,7 +295,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, see include/asm-x86_64/uaccess.h for details. */ set_fs(USER_DS); - regs->flags &= ~X86_EFLAGS_TF; + regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF); if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG -- cgit v1.2.2 From 609b5297bcfb7b39b7a4137e9ec48407a8c96763 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 5 Mar 2008 08:35:14 +0000 Subject: x86: fix merge mistake in i387.c convert_fxsr_to_user() in 2.6.24's i387_32.c did this, and convert_to_fxsr() also does the inverse, so I assume it's an oversight that it is no longer being done. [ mingo@elte.hu: we encode it this way because there's no space for the 'FPU Last Instruction Opcode' (->fop) field in the legacy user_i387_ia32_struct that PTRACE_GETFPREGS/PTRACE_SETFPREGS uses. it's probably pure legacy - i'd be surprised if any user-space relied on the FPU Last Opcode in any way. But indeed we used to do it previously so the most conservative thing is to preserve that piece of information. ] Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 60fe80157569..d2e39e69aaf8 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -261,7 +261,7 @@ static void convert_from_fxsr(struct user_i387_ia32_struct *env, } #else env->fip = fxsave->fip; - env->fcs = fxsave->fcs; + env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); env->foo = fxsave->foo; env->fos = fxsave->fos; #endif -- cgit v1.2.2 From d032b31a3a22a571cb50c0b5dffbe9ba9328d6e2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 5 Mar 2008 08:36:48 +0000 Subject: x86: fix typo in step.c TIF_DEBUGCTLMSR has no meaning in the actual MSR... Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/step.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 2ef1a5f8d675..9d406cdc847f 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block) child->thread.debugctlmsr | DEBUGCTLMSR_BTF); } else { write_debugctlmsr(child, - child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); + child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); if (!child->thread.debugctlmsr) clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); @@ -189,7 +189,7 @@ void user_disable_single_step(struct task_struct *child) * Make sure block stepping (BTF) is disabled. */ write_debugctlmsr(child, - child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); + child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); if (!child->thread.debugctlmsr) clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); -- cgit v1.2.2 From 7432d149fda8ce9ead9df91e577b83ce52ad5f65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 6 Mar 2008 18:29:43 +0100 Subject: x86: re-add reboot fixups Jan Beulich noticed that the reboot fixups went missing during reboot.c unification. (commit 4d022e35fd7e07c522c7863fee6f07e53cf3fc14) Geode and a few other rare boards with special reboot quirks are affected. Reported-by: Jan Beulich Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 7fd6ac43e4a1..55ceb8cdef75 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -326,6 +326,10 @@ static inline void kb_wait(void) } } +void __attribute__((weak)) mach_reboot_fixups(void) +{ +} + static void native_machine_emergency_restart(void) { int i; @@ -337,6 +341,8 @@ static void native_machine_emergency_restart(void) /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { case BOOT_KBD: + mach_reboot_fixups(); /* for board specific fixups */ + for (i = 0; i < 10; i++) { kb_wait(); udelay(50); -- cgit v1.2.2 From 1722770f131bb5c8e238825f3eba2efa331483a2 Mon Sep 17 00:00:00 2001 From: Peter Korsgaard Date: Thu, 6 Mar 2008 10:56:45 +0100 Subject: x86-boot: don't request VBE2 information The new x86 setup code (4fd06960f120) broke booting on an old P3/500MHz with an onboard Voodoo3 of mine. After debugging it, it turned out to be caused by the fact that the vesa probing now asks for VBE2 data. Disassembing the video BIOS shows that it overflows the vesa_general_info structure when VBE2 data is requested because the source addresses for the information strings which get strcpy'ed to the buffer lie outside the 32K BIOS code (and hence contain long sequences of 0xff's). E.G.: get_vbe_controller_info: 00002A9C 60 pushaw 00002A9D 1E push ds 00002A9E 0E push cs 00002A9F 1F pop ds 00002AA0 2BC9 sub cx,cx 00002AA2 6626813D56424532 cmp dword [es:di],0x32454256 ; "VBE2" 00002AAA 7501 jnz .1 00002AAC 41 inc cx .1: 00002AAD 51 push cx 00002AAE B91400 mov cx,0x14 00002AB1 BED47F mov si, controller_header 00002AB4 57 push di 00002AB5 F3A4 rep movsb ; copy vbe1.2 header 00002AB7 B9EC00 mov cx,0xec 00002ABA 2AC0 sub al,al 00002ABC F3AA rep stosb ; zero pad remainder 00002ABE 5F pop di 00002ABF E8EB0D call word get_memory 00002AC2 C1E002 shl ax,0x2 00002AC5 26894512 mov [es:di+0x12],ax ; total memory 00002AC9 26C745040003 mov word [es:di+0x4],0x300 ; VBE version 00002ACF 268C4D08 mov [es:di+0x8],cs 00002AD3 268C4D10 mov [es:di+0x10],cs 00002AD7 59 pop cx 00002AD8 E361 jcxz .done ; VBE2 requested? 00002ADA 8D9D0001 lea bx,[di+0x100] 00002ADE 53 push bx 00002ADF 87DF xchg bx,di ; di now points to 2nd half 00002AE1 26C747140001 mov word [es:bx+0x14],0x100 ; sw rev 00002AE7 26897F06 mov [es:bx+0x6],di ; oem string 00002AEB 268C4708 mov [es:bx+0x8],es 00002AEF BE5280 mov si,0x8052 ; oem string 00002AF2 E87A1B call word strcpy 00002AF5 26897F0E mov [es:bx+0xe],di ; video mode list 00002AF9 268C4710 mov [es:bx+0x10],es 00002AFD B91E00 mov cx,0x1e 00002B00 BEE87F mov si,vidmodes 00002B03 F3A5 rep movsw 00002B05 26897F16 mov [es:bx+0x16],di ; oem vendor 00002B09 268C4718 mov [es:bx+0x18],es 00002B0D BE2480 mov si,0x8024 ; oem vendor 00002B10 E85C1B call word strcpy 00002B13 26897F1A mov [es:bx+0x1a],di ; oem product 00002B17 268C471C mov [es:bx+0x1c],es 00002B1B BE3880 mov si,0x8038 ; oem product 00002B1E E84E1B call word strcpy 00002B21 26897F1E mov [es:bx+0x1e],di ; oem product rev 00002B25 268C4720 mov [es:bx+0x20],es 00002B29 BE4580 mov si,0x8045 ; oem product rev 00002B2C E8401B call word strcpy 00002B2F 58 pop ax 00002B30 B90001 mov cx,0x100 00002B33 2BCF sub cx,di 00002B35 03C8 add cx,ax 00002B37 2AC0 sub al,al 00002B39 F3AA rep stosb ; zero pad .done: 00002B3B 1F pop ds 00002B3C 61 popaw 00002B3D B84F00 mov ax,0x4f 00002B40 C3 ret (The full BIOS can be found at http://peter.korsgaard.com/vgabios.bin if interested). The old setup code didn't ask for VBE2 info, and the new code doesn't actually do anything with the extra information, so the fix is to simply not request it. Other BIOS'es might have the same problem. Signed-off-by: Peter Korsgaard Signed-off-by: Ingo Molnar --- arch/x86/boot/vesa.h | 9 +-------- arch/x86/boot/video-vesa.c | 2 -- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/vesa.h b/arch/x86/boot/vesa.h index ff5b73cd406f..468e444622c5 100644 --- a/arch/x86/boot/vesa.h +++ b/arch/x86/boot/vesa.h @@ -26,17 +26,10 @@ struct vesa_general_info { far_ptr video_mode_ptr; /* 14 */ u16 total_memory; /* 18 */ - u16 oem_software_rev; /* 20 */ - far_ptr oem_vendor_name_ptr; /* 22 */ - far_ptr oem_product_name_ptr; /* 26 */ - far_ptr oem_product_rev_ptr; /* 30 */ - - u8 reserved[222]; /* 34 */ - u8 oem_data[256]; /* 256 */ + u8 reserved[236]; /* 20 */ } __attribute__ ((packed)); #define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24)) -#define VBE2_MAGIC ('V' + ('B' << 8) + ('E' << 16) + ('2' << 24)) struct vesa_mode_info { u16 mode_attr; /* 0 */ diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 662dd2f13068..419b5c273374 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -37,8 +37,6 @@ static int vesa_probe(void) video_vesa.modes = GET_HEAP(struct mode_info, 0); - vginfo.signature = VBE2_MAGIC; - ax = 0x4f00; di = (size_t)&vginfo; asm(INT10 -- cgit v1.2.2 From 84c6f6046c5a2189160a8f0dca8b90427bf690ea Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 7 Mar 2008 14:56:02 -0800 Subject: x86_64: make ptrace always sign-extend orig_ax to 64 bits This makes 64-bit ptrace calls setting the 64-bit orig_ax field for a 32-bit task sign-extend the low 32 bits up to 64. This matches what a 64-bit debugger expects when tracing a 32-bit task. This follows on my "x86_64 ia32 syscall restart fix". This didn't matter until that was fixed. The debugger ignores or zeros the high half of every register slot it sets (including the orig_rax pseudo-register) uniformly. It expects that the setting of the low 32 bits always has the same meaning as a 32-bit debugger setting those same 32 bits with native 32-bit facilities. This never arose before because the syscall restart check never matched any -ERESTART* values due to lack of sign extension. Before that fix, even 32-bit ptrace setting orig_eax to -1 failed to trigger the restart check anyway. So this was never noticed as a regression of 64-bit debuggers vs 32-bit debuggers on the same 64-bit kernel. Signed-off-by: Roland McGrath [ Changed to just do the sign-extension unconditionally on x86-64, since orig_ax is always just a small integer and doesn't need the full 64-bit range ] Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f41fdc98efb1..8f64abe699fd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -323,6 +323,16 @@ static int putreg(struct task_struct *child, return set_flags(child, value); #ifdef CONFIG_X86_64 + /* + * Orig_ax is really just a flag with small positive and + * negative values, so make sure to always sign-extend it + * from 32 bits so that it works correctly regardless of + * whether we come from a 32-bit environment or not. + */ + case offsetof(struct user_regs_struct, orig_ax): + value = (long) (s32) value; + break; + case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_OF(child)) return -EIO; -- cgit v1.2.2 From 3fabc55f34b72720e8a10aa442bd3415a211edb3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 11 Mar 2008 09:35:56 -0500 Subject: lguest: Sanitize the lguest clock. Now the TSC code handles a zero return from calculate_cpu_khz(), lguest can simply pass through the value it gets from the Host: if non-zero, all the normal TSC code applies. Otherwise (or if the Host really doesn't support TSC), the clocksource code will fall back to the slower but reasonable lguest clock. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 53 ++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index cccb38a59653..9c27c104d83c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -84,7 +84,6 @@ struct lguest_data lguest_data = { .blocked_interrupts = { 1 }, /* Block timer interrupts */ .syscall_vec = SYSCALL_VECTOR, }; -static cycle_t clock_base; /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we @@ -327,8 +326,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ - *dx &= 0x07808101; + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ + *dx &= 0x07808111; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls @@ -595,19 +594,25 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } +/* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at, + * or 0 if it's unusable as a reliable clock source. This matches what we want + * here: if we return 0 from this function, the x86 TSC clock will not register + * itself. */ +static unsigned long lguest_cpu_khz(void) +{ + return lguest_data.tsc_khz; +} + +/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where + * we read the time value given to us by the Host. */ static cycle_t lguest_clock_read(void) { unsigned long sec, nsec; - /* If the Host tells the TSC speed, we can trust that. */ - if (lguest_data.tsc_khz) - return native_read_tsc(); - - /* If we can't use the TSC, we read the time value written by the Host. - * Since it's in two parts (seconds and nanoseconds), we risk reading - * it just as it's changing from 99 & 0.999999999 to 100 and 0, and - * getting 99 and 0. As Linux tends to come apart under the stress of - * time travel, we must be careful: */ + /* Since the time is in two parts (seconds and nanoseconds), we risk + * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, + * and getting 99 and 0. As Linux tends to come apart under the stress + * of time travel, we must be careful: */ do { /* First we read the seconds part. */ sec = lguest_data.time.tv_sec; @@ -622,14 +627,14 @@ static cycle_t lguest_clock_read(void) /* Now if the seconds part has changed, try again. */ } while (unlikely(lguest_data.time.tv_sec != sec)); - /* Our non-TSC clock is in real nanoseconds. */ + /* Our lguest clock is in real nanoseconds. */ return sec*1000000000ULL + nsec; } -/* This is what we tell the kernel is our clocksource. */ +/* This is the fallback clocksource: lower priority than the TSC clocksource. */ static struct clocksource lguest_clock = { .name = "lguest", - .rating = 400, + .rating = 200, .read = lguest_clock_read, .mask = CLOCKSOURCE_MASK(64), .mult = 1 << 22, @@ -637,12 +642,6 @@ static struct clocksource lguest_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* The "scheduler clock" is just our real clock, adjusted to start at zero */ -static unsigned long long lguest_sched_clock(void) -{ - return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base); -} - /* We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I * just applied the patch. */ @@ -712,19 +711,8 @@ static void lguest_time_init(void) /* Set up the timer interrupt (0) to go to our simple timer routine */ set_irq_handler(0, lguest_time_irq); - /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can - * use the TSC, otherwise it's a dumb nanosecond-resolution clock. - * Either way, the "rating" is set so high that it's always chosen over - * any other clocksource. */ - if (lguest_data.tsc_khz) - lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, - lguest_clock.shift); - clock_base = lguest_clock_read(); clocksource_register(&lguest_clock); - /* Now we've set up our clock, we can use it as the scheduler clock */ - pv_time_ops.sched_clock = lguest_sched_clock; - /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ lguest_clockevent.cpumask = cpumask_of_cpu(0); @@ -995,6 +983,7 @@ __init void lguest_init(void) /* time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; + pv_time_ops.get_cpu_khz = lguest_cpu_khz; /* Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). */ -- cgit v1.2.2 From 4357bd9453b81e0a41db1dec16e06d74256b7560 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 11 Mar 2008 09:35:57 -0500 Subject: lguest: Revert 1ce70c4fac3c3954bd48c035f448793867592bc0, fix real problem. Ahmed managed to crash the Host in release_pgd(), which cannot be a Guest bug, and indeed it wasn't. The bug was that handing a 0 as the address of the toplevel page table being manipulated can cause the lookup code in find_pgdir() to return an uninitialized cache entry (we shadow up to 4 top level page tables for each Guest). Commit 37cc8d7f963ba2deec29c9b68716944516a3244f introduced this behaviour in the Guest, uncovering the bug. The patch which he submitted (which removed the /4 from the index calculation) simply ensured that these high-indexed entries hit the early exit path of guest_set_pmd(). But you get lots of segfaults in guest userspace as the PMDs aren't being updated. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 9c27c104d83c..a104c532ff70 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -480,7 +480,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, - (__pa(pmdp)&(PAGE_SIZE-1)), 0); + (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); } /* There are a couple of legacy places where the kernel sets a PTE, but we -- cgit v1.2.2 From f5dbb55b995b77d396fe2204495a0af3e24d28c2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Mar 2008 18:04:34 +0100 Subject: fix BIOS PCI config cycle buglet causing ACPI boot regression I figured out another ACPI related regression today. randconfig testing triggered an early boot-time hang on a laptop of mine (32-bit x86, config attached) - the screen was scrolling ACPI AML exceptions [with no serial port and no early debugging available]. v2.6.24 works fine on that laptop with the same .config, so after a few hours of bisection (had to restart it 3 times - other regressions interacted), it honed in on this commit: | 10270d4838bdc493781f5a1cf2e90e9c34c9142f is first bad commit | | Author: Linus Torvalds | Date: Wed Feb 13 09:56:14 2008 -0800 | | acpi: fix acpi_os_read_pci_configuration() misuse of raw_pci_read() reverting this commit ontop of -rc5 gave a correctly booting kernel. But this commit fixes a real bug so the real question is, why did it break the bootup? After quite some head-scratching, the following change stood out: - pci_id->bus = tu8; + pci_id->bus = val; pci_id->bus is defined as u16: struct acpi_pci_id { u16 segment; u16 bus; ... and 'tu8' changed from u8 to u32. So previously we'd unconditionally mask the return value of acpi_os_read_pci_configuration() (raw_pci_read()) to 8 bits, but now we just trust whatever comes back from the PCI access routines and only crop it to 16 bits. But if the high 8 bits of that result contains any noise then we'll write that into ACPI's PCI ID descriptor and confuse the heck out of the rest of ACPI. So lets check the PCI-BIOS code on that theory. We have this codepath for 8-bit accesses (arch/x86/pci/pcbios.c:pci_bios_read()): switch (len) { case 1: __asm__("lcall *(%%esi); cld\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" : "=c" (*value), "=a" (result) : "1" (PCIBIOS_READ_CONFIG_BYTE), "b" (bx), "D" ((long)reg), "S" (&pci_indirect)); Aha! The "=a" output constraint puts the full 32 bits of EAX into *value. But if the BIOS's routines set any of the high bits to nonzero, we'll return a value with more set in it than intended. The other, more common PCI access methods (v1 and v2 PCI reads) clear out the high bits already, for example pci_conf1_read() does: switch (len) { case 1: *value = inb(0xCFC + (reg & 3)); which explicitly converts the return byte up to 32 bits and zero-extends it. So zero-extending the result in the PCI-BIOS read routine fixes the regression on my laptop. ( It might fix some other long-standing issues we had with PCI-BIOS during the past decade ... ) Both 8-bit and 16-bit accesses were buggy. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/pci/pcbios.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 10ac8c316c46..2f7109ac4c15 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -198,6 +198,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, "b" (bx), "D" ((long)reg), "S" (&pci_indirect)); + /* + * Zero-extend the result beyond 8 bits, do not trust the + * BIOS having done it: + */ + *value &= 0xff; break; case 2: __asm__("lcall *(%%esi); cld\n\t" @@ -210,6 +215,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, "b" (bx), "D" ((long)reg), "S" (&pci_indirect)); + /* + * Zero-extend the result beyond 16 bits, do not trust the + * BIOS having done it: + */ + *value &= 0xffff; break; case 4: __asm__("lcall *(%%esi); cld\n\t" -- cgit v1.2.2 From 9a46d7e5b63903a70cd96c2c1391a7a26a8dbec9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 Feb 2008 09:30:32 +0100 Subject: x86: ioremap, remove WARN_ON() Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index ac3c959e271d..8fe576baa148 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -134,8 +134,6 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, return NULL; } - WARN_ON_ONCE(page_is_ram(pfn)); - switch (mode) { case IOR_MODE_UNCACHED: default: -- cgit v1.2.2 From 40f0933d51f4cba26a5c009a26bb230f4514c1b6 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 28 Feb 2008 19:57:07 -0800 Subject: x86: ia32 syscall restart fix The code to restart syscalls after signals depends on checking for a negative orig_ax, and for particular negative -ERESTART* values in ax. These fields are 64 bits and for a 32-bit task they get zero-extended. The syscall restart behavior is lost, a regression from a native 32-bit kernel and from 64-bit tasks' behavior. This patch fixes the problem by doing sign-extension where it matters. For orig_ax, the only time the value should be -1 but winds up as 0x0ffffffff is via a 32-bit ptrace call. So the patch changes ptrace to sign-extend the 32-bit orig_eax value when it's stored; it doesn't change the checks on orig_ax, though it uses the new current_syscall() inline to better document the subtle importance of the used of signedness there. The ax value is stored a lot of ways and it seems hard to get them all sign-extended at their origins. So for that, we use the current_syscall_ret() to sign-extend it only for 32-bit tasks at the time of the -ERESTART* comparisons. Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 9 ++++++++- arch/x86/kernel/signal_64.c | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8f64abe699fd..d5904eef1d31 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1055,10 +1055,17 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) R32(esi, si); R32(ebp, bp); R32(eax, ax); - R32(orig_eax, orig_ax); R32(eip, ip); R32(esp, sp); + case offsetof(struct user32, regs.orig_eax): + /* + * Sign-extend the value so that orig_eax = -1 + * causes (long)orig_ax < 0 tests to fire correctly. + */ + regs->orig_ax = (long) (s32) value; + break; + case offsetof(struct user32, regs.eflags): return set_flags(child, value); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 56b72fb67f9b..1c83e5124c65 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -310,6 +310,35 @@ give_sigsegv: return -EFAULT; } +/* + * Return -1L or the syscall number that @regs is executing. + */ +static long current_syscall(struct pt_regs *regs) +{ + /* + * We always sign-extend a -1 value being set here, + * so this is always either -1L or a syscall number. + */ + return regs->orig_ax; +} + +/* + * Return a value that is -EFOO if the system call in @regs->orig_ax + * returned an error. This only works for @regs from @current. + */ +static long current_syscall_ret(struct pt_regs *regs) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + return (int) regs->ax; +#endif + return regs->ax; +} + /* * OK, we're invoking a handler */ @@ -327,9 +356,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #endif /* Are we from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (current_syscall(regs) >= 0) { /* If so, check system call restarting.. */ - switch (regs->ax) { + switch (current_syscall_ret(regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -426,10 +455,9 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (current_syscall(regs) >= 0) { /* Restart the system call - no handlers present */ - long res = regs->ax; - switch (res) { + switch (current_syscall_ret(regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: -- cgit v1.2.2 From 985a34bd75cc8c96e43f00dcdda7c3fdb51a3026 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 9 Mar 2008 13:14:37 +0100 Subject: x86: remove quicklists quicklists cause a serious memory leak on 32-bit x86, as documented at: http://bugzilla.kernel.org/show_bug.cgi?id=9991 the reason is that the quicklist pool is a special-purpose cache that grows out of proportion. It is not accounted for anywhere and users have no way to even realize that it's the quicklists that are causing RAM usage spikes. It was supposed to be a relatively small pool, but as demonstrated by KOSAKI Motohiro, they can grow as large as: Quicklists: 1194304 kB given how much trouble this code has caused historically, and given that Andrew objected to its introduction on x86 (years ago), the best option at this point is to remove them. [ any performance benefits of caching constructed pgds should be implemented in a more generic way (possibly within the page allocator), while still allowing constructed pages to be allocated by other workloads. ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 --- arch/x86/mm/pgtable_32.c | 18 +++++++++--------- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f41c9538ca30..237fc128143d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -66,9 +66,6 @@ config MMU config ZONE_DMA def_bool y -config QUICKLIST - def_bool X86_32 - config SBUS bool diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 73aba7125203..2f9e9afcb9f4 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -342,12 +342,16 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - mm->pgd = pgd; /* so that alloc_pd can use it */ + /* so that alloc_pd can use it */ + mm->pgd = pgd; + if (pgd) + pgd_ctor(pgd); if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - quicklist_free(0, pgd_dtor, pgd); + pgd_dtor(pgd); + free_page((unsigned long)pgd); pgd = NULL; } @@ -357,12 +361,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); - quicklist_free(0, pgd_dtor, pgd); -} - -void check_pgt_cache(void) -{ - quicklist_trim(0, pgd_dtor, 25, 16); + pgd_dtor(pgd); + free_page((unsigned long)pgd); } void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -- cgit v1.2.2