From cbc34973709eb41b369c304c075cf2069f847012 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 13 Feb 2008 13:14:35 -0800
Subject: lguest: include function prototypes

Added a declaration to asm-x86/lguest.h and moved the extern arrays there
as well.  As an alternative to including asm/lguest.h directly, an
include could be put in linux/lguest.h

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Cc: "rusty@rustcorp.com.au" <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/lguest/boot.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5afdde4895dc..1e613fb03e32 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -57,6 +57,7 @@
 #include <linux/lguest_launcher.h>
 #include <linux/virtio_console.h>
 #include <linux/pm.h>
+#include <asm/lguest.h>
 #include <asm/paravirt.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -75,15 +76,6 @@
  * behaving in simplified but equivalent ways.  In particular, the Guest is the
  * same kernel as the Host (or at least, built from the same source code). :*/
 
-/* Declarations for definitions in lguest_guest.S */
-extern char lguest_noirq_start[], lguest_noirq_end[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_sti[], lgend_sti[];
-extern const char lgstart_popf[], lgend_popf[];
-extern const char lgstart_pushf[], lgend_pushf[];
-extern const char lgstart_iret[], lgend_iret[];
-extern void lguest_iret(void);
-
 struct lguest_data lguest_data = {
 	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
 	.noirq_start = (u32)lguest_noirq_start,
-- 
cgit v1.2.2


From db342d216ba9e060d8c5501eefc1d0a789c9e711 Mon Sep 17 00:00:00 2001
From: Tony Breeds <tony@bakeyournoodle.com>
Date: Tue, 19 Feb 2008 08:16:03 +0100
Subject: lguest: fix build breakage

[ mingo@elte.hu: merged to Rusty's patch ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/asm-offsets_32.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index a33d53017997..8ea040124f7d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -128,13 +128,11 @@ void foo(void)
 	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
 #endif
 
-#ifdef CONFIG_LGUEST_GUEST
+#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
 	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
-#endif
 
-#ifdef CONFIG_LGUEST
 	BLANK();
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
 	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
-- 
cgit v1.2.2


From 1ce70c4fac3c3954bd48c035f448793867592bc0 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <darwish.07@gmail.com>
Date: Sun, 24 Feb 2008 17:55:15 +0200
Subject: x86/lguest: fix pgdir pmd index calculation

Hi all,

Beginning from commits close to v2.6.25-rc2, running lguest always oopses
the host kernel. Oops is at [1].

Bisection led to the following commit:

commit 37cc8d7f963ba2deec29c9b68716944516a3244f

    x86/early_ioremap: don't assume we're using swapper_pg_dir

    At the early stages of boot, before the kernel pagetable has been
    fully initialized, a Xen kernel will still be running off the
    Xen-provided pagetables rather than swapper_pg_dir[].  Therefore,
    readback cr3 to determine the base of the pagetable rather than
    assuming swapper_pg_dir[].

 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
-	pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)];
+	/* Don't assume we're using swapper_pg_dir at this point */
+	pgd_t *base = __va(read_cr3());
+	pgd_t *pgd = &base[pgd_index(addr)];
 	pud_t *pud = pud_offset(pgd, addr);
 	pmd_t *pmd = pmd_offset(pud, addr);

Trying to analyze the problem, it seems on the guest side of lguest,
%cr3 has a different value from &swapper_pg-dir (which
is AFAIK fine on a pravirt guest):

Putting some debugging messages in early_ioremap_pmd:

/* Appears 3 times */
[    0.000000] ***************************
[    0.000000] __va(%cr3) = c0000000, &swapper_pg_dir = c02cc000
[    0.000000] ***************************

After 8 hours of debugging and staring on lguest code, I noticed something
strange in paravirt_ops->set_pmd hypercall invocation:

static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	*pmdp = pmdval;
	lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
}

The first hcall parameter is global pgdir which looks fine. The second
parameter is the pmd index in the pgdir which is suspectful.

AFAIK, calculating the index of pmd does not need a divisoin over four.
Removing the division made lguest work fine again . Patch is at [2].

I am not sure why the division over four existed in the first place. It
seems bogus, maybe the Xen patch just made the problem appear ?

[2]: The patch:

[PATCH] lguest: fix pgdir pmd index cacluation

Remove an error in index calculation which leads to removing
a not existing shadow page table (leading to a Null dereference).

Signed-off-by: Ahmed S. Darwish <darwish.07@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lguest/boot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 1e613fb03e32..cccb38a59653 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -481,7 +481,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	*pmdp = pmdval;
 	lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
-		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+		   (__pa(pmdp)&(PAGE_SIZE-1)), 0);
 }
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
-- 
cgit v1.2.2


From 92cb54a37a42a41cfb2ef7f1478bfa4395198258 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 13 Feb 2008 14:37:52 +0100
Subject: x86: make DEBUG_PAGEALLOC and CPA more robust

Use PF_MEMALLOC to prevent recursive calls in the DBEUG_PAGEALLOC
case. This makes the code simpler and more robust against allocation
failures.

This fixes the following fallback to non-mmconfig:

   http://lkml.org/lkml/2008/2/20/551
   http://bugzilla.kernel.org/show_bug.cgi?id=10083

Also, for DEBUG_PAGEALLOC=n reduce the pool size to one page.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/pageattr.c | 84 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 464d8fc21ce6..14e48b5a94ba 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -44,6 +44,12 @@ static inline unsigned long highmap_end_pfn(void)
 
 #endif
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
 static inline int
 within(unsigned long addr, unsigned long start, unsigned long end)
 {
@@ -355,45 +361,48 @@ out_unlock:
 
 static LIST_HEAD(page_pool);
 static unsigned long pool_size, pool_pages, pool_low;
-static unsigned long pool_used, pool_failed, pool_refill;
+static unsigned long pool_used, pool_failed;
 
-static void cpa_fill_pool(void)
+static void cpa_fill_pool(struct page **ret)
 {
-	struct page *p;
 	gfp_t gfp = GFP_KERNEL;
+	unsigned long flags;
+	struct page *p;
 
-	/* Do not allocate from interrupt context */
-	if (in_irq() || irqs_disabled())
-		return;
 	/*
-	 * Check unlocked. I does not matter when we have one more
-	 * page in the pool. The bit lock avoids recursive pool
-	 * allocations:
+	 * Avoid recursion (on debug-pagealloc) and also signal
+	 * our priority to get to these pagetables:
 	 */
-	if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill))
+	if (current->flags & PF_MEMALLOC)
 		return;
+	current->flags |= PF_MEMALLOC;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
 	/*
-	 * We could do:
-	 * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
-	 * but this fails on !PREEMPT kernels
+	 * Allocate atomically from atomic contexts:
 	 */
-	gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-#endif
+	if (in_atomic() || irqs_disabled() || debug_pagealloc)
+		gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
 
-	while (pool_pages < pool_size) {
+	while (pool_pages < pool_size || (ret && !*ret)) {
 		p = alloc_pages(gfp, 0);
 		if (!p) {
 			pool_failed++;
 			break;
 		}
-		spin_lock_irq(&pgd_lock);
+		/*
+		 * If the call site needs a page right now, provide it:
+		 */
+		if (ret && !*ret) {
+			*ret = p;
+			continue;
+		}
+		spin_lock_irqsave(&pgd_lock, flags);
 		list_add(&p->lru, &page_pool);
 		pool_pages++;
-		spin_unlock_irq(&pgd_lock);
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
-	clear_bit_unlock(0, &pool_refill);
+
+	current->flags &= ~PF_MEMALLOC;
 }
 
 #define SHIFT_MB		(20 - PAGE_SHIFT)
@@ -414,11 +423,15 @@ void __init cpa_init(void)
 	 * GiB. Shift MiB to Gib and multiply the result by
 	 * POOL_PAGES_PER_GB:
 	 */
-	gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
-	pool_size = POOL_PAGES_PER_GB * gb;
+	if (debug_pagealloc) {
+		gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+		pool_size = POOL_PAGES_PER_GB * gb;
+	} else {
+		pool_size = 1;
+	}
 	pool_low = pool_size;
 
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
 	printk(KERN_DEBUG
 	       "CPA: page pool initialized %lu of %lu pages preallocated\n",
 	       pool_pages, pool_size);
@@ -440,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	spin_lock_irqsave(&pgd_lock, flags);
 	if (list_empty(&page_pool)) {
 		spin_unlock_irqrestore(&pgd_lock, flags);
-		return -ENOMEM;
+		base = NULL;
+		cpa_fill_pool(&base);
+		if (!base)
+			return -ENOMEM;
+		spin_lock_irqsave(&pgd_lock, flags);
+	} else {
+		base = list_first_entry(&page_pool, struct page, lru);
+		list_del(&base->lru);
+		pool_pages--;
+
+		if (pool_pages < pool_low)
+			pool_low = pool_pages;
 	}
 
-	base = list_first_entry(&page_pool, struct page, lru);
-	list_del(&base->lru);
-	pool_pages--;
-
-	if (pool_pages < pool_low)
-		pool_low = pool_pages;
-
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -734,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
 		cpa_flush_all(cache);
 
 out:
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
+
 	return ret;
 }
 
@@ -897,7 +915,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	 * Try to refill the page pool here. We can do this only after
 	 * the tlb flush.
 	 */
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
 }
 
 #ifdef CONFIG_HIBERNATION
-- 
cgit v1.2.2


From b02a7f22f39f02fdf5a1380ff700293639db4490 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 5 Feb 2008 00:48:13 +0100
Subject: x86: hpet fix docbook comment

Signed-off-by: Pavel Machek <Pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/hpet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 429d084e014d..235fd6c77504 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -368,8 +368,8 @@ static int hpet_clocksource_register(void)
 	return 0;
 }
 
-/*
- * Try to setup the HPET timer
+/**
+ * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
  */
 int __init hpet_enable(void)
 {
-- 
cgit v1.2.2


From a7ef94e6889186848573a10c5bdb8271405f44de Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 14 Feb 2008 14:51:00 -0800
Subject: x86: do not promote TM3x00/TM5x00 to i686-class

We have been promoting Transmeta TM3x00/TM5x00 chips to i686-class
based on the notion that they contain all the user-space visible
features of an i686-class chip.  However, this is not actually true:
they lack the EA-taking long NOPs (0F 1F /0).  Since this is a
userspace-visible incompatibility, downgrade these CPUs to the
manufacturer-defined i586 level.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/transmeta.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 200fb3f9ebfb..e8b422c1c512 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -76,13 +76,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 	/* All Transmeta CPUs have a constant TSC */
 	set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
 	
-	/* If we can run i686 user-space code, call us an i686 */
-#define USER686 ((1 << X86_FEATURE_TSC)|\
-		 (1 << X86_FEATURE_CX8)|\
-		 (1 << X86_FEATURE_CMOV))
-        if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686)
-		c->x86 = 6;
-
 #ifdef CONFIG_SYSCTL
 	/* randomize_va_space slows us down enormously;
 	   it probably triggers retranslation of x86->native bytecode */
-- 
cgit v1.2.2


From 7343b3b3a627eb30e24e921f004f659c8ebb91c5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 14 Feb 2008 14:52:05 -0800
Subject: x86: require family >= 6 if we are using P6 NOPs

The P6 family of NOPs are only available on family >= 6 or above, so
enforce that in the boot code.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Kconfig.cpu | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e09a6b73a1aa..86fd2a0e4597 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -377,6 +377,10 @@ config X86_OOSTORE
 	def_bool y
 	depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
 
+config X86_P6_NOP
+	def_bool y
+	depends on (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4)
+
 config X86_TSC
 	def_bool y
 	depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
@@ -390,6 +394,7 @@ config X86_CMOV
 config X86_MINIMUM_CPU_FAMILY
 	int
 	default "64" if X86_64
+	default "6" if X86_32 && X86_P6_NOP
 	default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
 	default "3"
 
-- 
cgit v1.2.2


From 959b3be64cab9160cd74532a49b89cdd918d38e9 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 14 Feb 2008 14:56:45 -0800
Subject: x86: don't use P6_NOPs if compiling with CONFIG_X86_GENERIC

P6_NOPs are definitely not supported on some VIA CPUs, and possibly
(unverified) on AMD K7s.  It is also the only thing that prevents a
686 kernel from running on Transmeta TM3x00/5x00 (Crusoe) series.

The performance benefit over generic NOPs is very small, so when
building for generic consumption, avoid using them.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Kconfig.cpu | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 86fd2a0e4597..6d50064db182 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -377,9 +377,18 @@ config X86_OOSTORE
 	def_bool y
 	depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
 
+#
+# P6_NOPs are a relatively minor optimization that require a family >=
+# 6 processor, except that it is broken on certain VIA chips.
+# Furthermore, AMD chips prefer a totally different sequence of NOPs
+# (which work on all CPUs).  As a result, disallow these if we're
+# compiling X86_GENERIC but not X86_64 (these NOPs do work on all
+# x86-64 capable chips); the list of processors in the right-hand clause
+# are the cores that benefit from this optimization.
+#
 config X86_P6_NOP
 	def_bool y
-	depends on (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4)
+	depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4)
 
 config X86_TSC
 	def_bool y
-- 
cgit v1.2.2


From 829157be590af1c2555fb74c3c4db3327e3201fc Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 13 Feb 2008 11:16:46 -0800
Subject: x86: handle BIOSes which terminate e820 with CF=1 and no SMAP

The proper way to terminate the e820 chain is with %ebx == 0 on the
last legitimate memory block.  However, several BIOSes don't do that
and instead return error (CF = 1) when trying to read off the end of
the list.  For this error return, %eax doesn't necessarily return the
SMAP signature -- correctly so, since %ah should contain an error code
in this case.

To deal with some particularly broken BIOSes, we clear the entire e820
chain if the SMAP signature is missing in the middle, indicating a
plain insane e820 implementation.  However, we need to make the test
for CF = 1 before the SMAP check.

This fixes at least one HP laptop (nc6400) for which none of the
memory-probing methods (e820, e801, 88) functioned fully according to
spec.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/boot/memory.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 378353956b5d..e77d89f9e8aa 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -37,6 +37,12 @@ static int detect_memory_e820(void)
 		      "=m" (*desc)
 		    : "D" (desc), "d" (SMAP), "a" (0xe820));
 
+		/* BIOSes which terminate the chain with CF = 1 as opposed
+		   to %ebx = 0 don't always report the SMAP signature on
+		   the final, failing, probe. */
+		if (err)
+			break;
+
 		/* Some BIOSes stop returning SMAP in the middle of
 		   the search loop.  We don't know exactly how the BIOS
 		   screwed up the map at that point, we might have a
@@ -47,9 +53,6 @@ static int detect_memory_e820(void)
 			break;
 		}
 
-		if (err)
-			break;
-
 		count++;
 		desc++;
 	} while (next && count < E820MAX);
-- 
cgit v1.2.2


From f5106d91f2bf9153d6420f9ebb8114f73f9ce66a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 18 Feb 2008 13:10:44 -0800
Subject: x86/mtrr: fix kernel-doc missing notation

Fix mtrr kernel-doc warning:
Warning(linux-2.6.24-git12//arch/x86/kernel/cpu/mtrr/main.c:677): No description found for parameter 'end_pfn'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index b6e136f23d3d..c8fda3eb3d7e 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -649,6 +649,7 @@ static __init int amd_special_default_mtrr(void)
 
 /**
  * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
  *
  * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
  * memory configurations.  This routine checks that the highest MTRR matches
-- 
cgit v1.2.2


From 7265b6f10deba1cbe071cee646b063bda07ecd68 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 19 Feb 2008 11:02:30 +0100
Subject: x86: notsc is ignored on common configurations

notsc is ignored in 32-bit kernels if CONFIG_X86_TSC is on.. which is
bad, fix it.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc_32.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 43517e324be8..f14cfd9d1f94 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -28,7 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz);
 static int __init tsc_setup(char *str)
 {
 	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
-				"cannot disable TSC.\n");
+				"cannot disable TSC completely.\n");
+	mark_tsc_unstable("user disabled TSC");
 	return 1;
 }
 #else
-- 
cgit v1.2.2


From 3b57bc461fd5019aef4cfc77d4faf56ebe95449c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <Yinghai.Lu@Sun.COM>
Date: Wed, 20 Feb 2008 18:53:17 -0800
Subject: x86: remove double-checking empty zero pages debug

so far no one complained about that.

Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bb652f5a93fb..6dbb0035c57a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -515,14 +515,6 @@ void __init mem_init(void)
 
 	/* clear_bss() already clear the empty_zero_page */
 
-	/* temporary debugging - double check it's true: */
-	{
-		int i;
-
-		for (i = 0; i < 1024; i++)
-			WARN_ON_ONCE(empty_zero_page[i]);
-	}
-
 	reservedpages = 0;
 
 	/* this will put all low memory onto the freelists */
-- 
cgit v1.2.2


From 88f3aec7afd9ae3e6f6d221801996b69aad1e3a4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 21 Feb 2008 11:04:11 +0100
Subject: x86: fix spontaneous reboot with allyesconfig bzImage

recently the 64-bit allyesconfig bzImage kernel started spontaneously
rebooting during early bootup.

after a few fun hours spent with early init debugging, it turns out
that we've got this rather annoying limit on the size of the kernel
image:

      #define KERNEL_TEXT_SIZE  (40*1024*1024)

which limit my vmlinux just happened to pass:

       text           data       bss        dec       hex   filename
   29703744        4222751   8646224   42572719   2899baf   vmlinux

40 MB is 42572719 bytes, so my vmlinux was just 1.5% above this limit :-/

So it happily crashed right in head_64.S, which - as we all know - is
the most debuggable code in the whole architecture ;-)

So increase the limit to allow an up to 128MB kernel image to be mapped.
(should anyone be that crazy or lazy)

We have a full 4K of pagetable (level2_kernel_pgt) allocated for these
mappings already, so there's no RAM overhead and the limit was rather
pointless and arbitrary.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S | 22 ++++++++++++++--------
 arch/x86/mm/init_64.c     |  5 +++--
 2 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index eb415043a929..b037b15be7f8 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -379,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt)
 	/* Since I easily can, map the first 1G.
 	 * Don't set NX because code runs from these pages.
 	 */
-	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
 
 NEXT_PAGE(level2_kernel_pgt)
-	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
-	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
-	/* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
-	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
-	/* Module mapping starts here */
-	.fill	(PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
+	/*
+	 * 128 MB kernel mapping. We spend a full page on this pagetable
+	 * anyway.
+	 *
+	 * The kernel code+data+bss must not be bigger than that.
+	 *
+	 * (NOTE: at +128MB starts the module area, see MODULES_VADDR.
+	 *  If you want to increase this then increase MODULES_VADDR
+	 *  too.)
+	 */
+	PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
+		KERNEL_TEXT_SIZE/PMD_SIZE)
 
 NEXT_PAGE(level2_spare_pgt)
-	.fill   512,8,0
+	.fill   512, 8, 0
 
 #undef PMDS
 #undef NEXT_PAGE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6dbb0035c57a..a02a14f0f324 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -172,8 +172,9 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
 }
 
 /*
- * The head.S code sets up the kernel high mapping from:
- * __START_KERNEL_map to __START_KERNEL_map + KERNEL_TEXT_SIZE
+ * The head.S code sets up the kernel high mapping:
+ *
+ *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
  *
  * phys_addr holds the negative offset to the kernel, which is added
  * to the compile time generated pmds. This results in invalid pmds up
-- 
cgit v1.2.2


From d4afe414189b098d56bcd24280c018aa2ac9a990 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 21 Feb 2008 13:39:30 +0100
Subject: x86: rename KERNEL_TEXT_SIZE => KERNEL_IMAGE_SIZE

The KERNEL_TEXT_SIZE constant was mis-named, as we not only map the kernel
text but data, bss and init sections as well.

That name led me on the wrong path with the KERNEL_TEXT_SIZE regression,
because i knew how big of _text_ my images have and i knew about the 40 MB
"text" limit so i wrongly thought to be on the safe side of the 40 MB limit
with my 29 MB of text, while the total image size was slightly above 40 MB.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b037b15be7f8..a007454133a3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -393,7 +393,7 @@ NEXT_PAGE(level2_kernel_pgt)
 	 *  too.)
 	 */
 	PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
-		KERNEL_TEXT_SIZE/PMD_SIZE)
+		KERNEL_IMAGE_SIZE/PMD_SIZE)
 
 NEXT_PAGE(level2_spare_pgt)
 	.fill   512, 8, 0
-- 
cgit v1.2.2


From ce28b9864b853803320c3f1d8de1b81aa4120b14 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 Feb 2008 23:57:30 +0100
Subject: x86: fix vsyscall wreckage

based on a report from Arne Georg Gleditsch about user-space apps
misbehaving after toggling /proc/sys/kernel/vsyscall64, a review
of the code revealed that the "NOP patching" done there is
fundamentally unsafe for a number of reasons:

1) the patching code runs without synchronizing other CPUs

2) it inserts NOPs even if there is no clock source which provides vread

3) when the clock source changes to one without vread we run in
   exactly the same problem as in #2

4) if nobody toggles the proc entry from 1 to 0 and to 1 again, then
   the syscall is not patched out

as a result it is possible to break user-space via this patching.
The only safe thing for now is to remove the patching.

This code was broken since v2.6.21.

Reported-by: Arne Georg Gleditsch <arne.gleditsch@dolphinics.no>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vsyscall_64.c | 52 +++----------------------------------------
 1 file changed, 3 insertions(+), 49 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3f8242774580..b6be812fac05 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -44,11 +44,6 @@
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
 #define __syscall_clobber "r11","cx","memory"
-#define __pa_vsymbol(x)			\
-	({unsigned long v;  		\
-	extern char __vsyscall_0; 	\
-	  asm("" : "=r" (v) : "0" (x)); \
-	  ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
 
 /*
  * vsyscall_gtod_data contains data that is :
@@ -102,7 +97,7 @@ static __always_inline void do_get_tz(struct timezone * tz)
 static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	int ret;
-	asm volatile("vsysc2: syscall"
+	asm volatile("syscall"
 		: "=a" (ret)
 		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
 		: __syscall_clobber );
@@ -112,7 +107,7 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
 static __always_inline long time_syscall(long *t)
 {
 	long secs;
-	asm volatile("vsysc1: syscall"
+	asm volatile("syscall"
 		: "=a" (secs)
 		: "0" (__NR_time),"D" (t) : __syscall_clobber);
 	return secs;
@@ -227,50 +222,10 @@ long __vsyscall(3) venosys_1(void)
 }
 
 #ifdef CONFIG_SYSCTL
-
-#define SYSCALL 0x050f
-#define NOP2    0x9090
-
-/*
- * NOP out syscall in vsyscall page when not needed.
- */
-static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
-                        void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	extern u16 vsysc1, vsysc2;
-	u16 __iomem *map1;
-	u16 __iomem *map2;
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-	if (!write)
-		return ret;
-	/* gcc has some trouble with __va(__pa()), so just do it this
-	   way. */
-	map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
-	if (!map1)
-		return -ENOMEM;
-	map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
-	if (!map2) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	if (!vsyscall_gtod_data.sysctl_enabled) {
-		writew(SYSCALL, map1);
-		writew(SYSCALL, map2);
-	} else {
-		writew(NOP2, map1);
-		writew(NOP2, map2);
-	}
-	iounmap(map2);
-out:
-	iounmap(map1);
-	return ret;
-}
-
 static ctl_table kernel_table2[] = {
 	{ .procname = "vsyscall64",
 	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
-	  .mode = 0644,
-	  .proc_handler = vsyscall_sysctl_change },
+	  .mode = 0644 },
 	{}
 };
 
@@ -279,7 +234,6 @@ static ctl_table kernel_root_table2[] = {
 	  .child = kernel_table2 },
 	{}
 };
-
 #endif
 
 /* Assume __initcall executes before all user space. Hopefully kmod
-- 
cgit v1.2.2


From 5d119b2c9a490e2d647eae134211b32a18a04c7d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 Feb 2008 12:55:57 +0100
Subject: x86: fix execve with -fstack-protect

pointed out by pageexec@freemail.hu:

> what happens here is that gcc treats the argument area as owned by the
> callee, not the caller and is allowed to do certain tricks. for ssp it
> will make a copy of the struct passed by value into the local variable
> area and pass *its* address down, and it won't copy it back into the
> original instance stored in the argument area.
>
> so once sys_execve returns, the pt_regs passed by value hasn't at all
> changed and its default content will cause a nice double fault (FWIW,
> this part took me the longest to debug, being down with cold didn't
> help it either ;).

To fix this we pass in pt_regs by pointer.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/entry_64.S   | 6 ++++--
 arch/x86/kernel/process_64.c | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2ad9a1bc6a73..c20c9e7e08dd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -453,6 +453,7 @@ ENTRY(stub_execve)
 	CFI_REGISTER rip, r11
 	SAVE_REST
 	FIXUP_TOP_OF_STACK %r11
+	movq %rsp, %rcx
 	call sys_execve
 	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
@@ -1036,15 +1037,16 @@ ENDPROC(child_rip)
  *	rdi: name, rsi: argv, rdx: envp
  *
  * We want to fallback into:
- *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
+ *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
  *
  * do_sys_execve asm fallback arguments:
- *	rdi: name, rsi: argv, rdx: envp, fake frame on the stack
+ *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
  */
 ENTRY(kernel_execve)
 	CFI_STARTPROC
 	FAKE_STACK_FRAME $0
 	SAVE_ALL	
+	movq %rsp,%rcx
 	call sys_execve
 	movq %rax, RAX(%rsp)	
 	RESTORE_REST
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b0cc8f0136d8..43f287744f9f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -730,16 +730,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
  */
 asmlinkage
 long sys_execve(char __user *name, char __user * __user *argv,
-		char __user * __user *envp, struct pt_regs regs)
+		char __user * __user *envp, struct pt_regs *regs)
 {
 	long error;
 	char * filename;
 
 	filename = getname(name);
 	error = PTR_ERR(filename);
-	if (IS_ERR(filename)) 
+	if (IS_ERR(filename))
 		return error;
-	error = do_execve(filename, argv, envp, &regs); 
+	error = do_execve(filename, argv, envp, regs);
 	putname(filename);
 	return error;
 }
-- 
cgit v1.2.2


From 4147c8747eace9058c606b35e700060297edaf91 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Feb 2008 15:50:14 +0100
Subject: x86: don't print a warning when MTRR are blank and running in KVM

Inside a KVM virtual machine the MTRRs are usually blank. This confuses Linux
and causes a warning message at boot. This patch removes that warning message
when running Linux as a KVM guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/main.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c8fda3eb3d7e..be83336fddba 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -43,6 +43,7 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
+#include <asm/kvm_para.h>
 #include "mtrr.h"
 
 u32 num_var_ranges = 0;
@@ -689,8 +690,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 
 	/* kvm/qemu doesn't have mtrr set right, don't trim them all */
 	if (!highest_pfn) {
-		printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n");
-		WARN_ON(1);
+		if (!kvm_para_available()) {
+			printk(KERN_WARNING
+				"WARNING: strange, CPU MTRRs all blank?\n");
+			WARN_ON(1);
+		}
 		return 0;
 	}
 
-- 
cgit v1.2.2


From ed2b7e2b1d1ae201afe8fbd111632074b7b53ed4 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 22 Feb 2008 21:58:37 +0200
Subject: x86: don't make swapper_pg_pmd global

There doesn't seem to be any reason for swapper_pg_pmd being global.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 25eb98540a41..fd8ca53943a8 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -606,7 +606,7 @@ ENTRY(_stext)
 .section ".bss.page_aligned","wa"
 	.align PAGE_SIZE_asm
 #ifdef CONFIG_X86_PAE
-ENTRY(swapper_pg_pmd)
+swapper_pg_pmd:
 	.fill 1024*KPMDS,4,0
 #else
 ENTRY(swapper_pg_dir)
-- 
cgit v1.2.2


From 1650743cdc0db73478f72c57544ce79ea8f3dda6 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 22 Feb 2008 19:23:58 +0100
Subject: x86: don't save unreliable stack trace entries

Currently, there is no way for print_stack_trace() to determine whether
a given stack trace entry was deemed reliable or not, simply because
save_stack_trace() does not record this information. (Perhaps needless
to say, this makes the saved stack traces A LOT harder to read, and
probably with no other benefits, since debugging features that use
save_stack_trace() most likely also require frame pointers, etc.)

This patch reverts to the old behaviour of only recording the reliable trace
entries for saved stack traces.

Signed-off-by: Vegard Nossum <vegardno@ifi.uio.no>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/stacktrace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 02f0f61f5b11..c28c342c162f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -25,6 +25,8 @@ static int save_stack_stack(void *data, char *name)
 static void save_stack_address(void *data, unsigned long addr, int reliable)
 {
 	struct stack_trace *trace = data;
+	if (!reliable)
+		return;
 	if (trace->skip > 0) {
 		trace->skip--;
 		return;
@@ -37,6 +39,8 @@ static void
 save_stack_address_nosched(void *data, unsigned long addr, int reliable)
 {
 	struct stack_trace *trace = (struct stack_trace *)data;
+	if (!reliable)
+		return;
 	if (in_sched_functions(addr))
 		return;
 	if (trace->skip > 0) {
-- 
cgit v1.2.2


From 2b775a27c0d9fdf8078d5b31e1e27411e5bf2a91 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Fri, 22 Feb 2008 12:09:29 -0300
Subject: x86: make c_idle.work have a static address.

Currently, c_idle is declared in the stack, and thus, have no static address.

Peter Zijlstra points out this simple solution, in which c_idle.work
is initializated separatedly. Note that the INIT_WORK macro has a static
declaration of a key inside.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Acked-by: Peter Zijlstra <pzijlstr@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index d53bd6fcb428..0880f2c388a9 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -554,10 +554,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 	int timeout;
 	unsigned long start_rip;
 	struct create_idle c_idle = {
-		.work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
 		.cpu = cpu,
 		.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
 	};
+	INIT_WORK(&c_idle.work, do_fork_idle);
 
 	/* allocate memory for gdts of secondary cpus. Hotplug is considered */
 	if (!cpu_gdt_descr[cpu].address &&
-- 
cgit v1.2.2


From 03994f01e8b72b3d01fd3d09d1cc7c9f421a727c Mon Sep 17 00:00:00 2001
From: Priit Laes <plaes@plaes.org>
Date: Sun, 24 Feb 2008 18:36:05 +0200
Subject: x86: fix build on non-C locales.

For some locales regex range [a-zA-Z] does not work as it is supposed to.
so we have to use [:alnum:] and [:xdigit:] to make it work as intended.

[1] http://en.wikipedia.org/wiki/Estonian_alphabet

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/vdso/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index f385a4b4a484..b8bd0c4aa02e 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -48,7 +48,7 @@ obj-$(VDSO64-y)			+= vdso-syms.lds
 # Match symbols in the DSO that look like VDSO*; produce a file of constants.
 #
 sed-vdsosym := -e 's/^00*/0/' \
-	-e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
+	-e 's/^\([[:xdigit:]]*\) . \(VDSO[[:alnum:]_]*\)$$/\2 = 0x\1;/p'
 quiet_cmd_vdsosym = VDSOSYM $@
       cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
 
-- 
cgit v1.2.2


From 12c247a6719987aad65f83158d2bb3e73c75c1f5 Mon Sep 17 00:00:00 2001
From: Mikael Pettersson <mikpe@it.uu.se>
Date: Sun, 24 Feb 2008 18:27:03 +0100
Subject: x86: fix boot failure on 486 due to TSC breakage

 > Diffing dmesg between git7 and git8 doesn't sched any light since
 > git8 also removed the printouts of the x86 caps as they were being
 > initialised and updated. I'm currently adding those printouts back
 > in the hope of seeing where and when the caps get broken.

That turned out to be very illuminating:

 --- dmesg-2.6.24-git7	2008-02-24 18:01:25.295851000 +0100
 +++ dmesg-2.6.24-git8	2008-02-24 18:01:25.530358000 +0100
 ...
 CPU: After generic identify, caps: 00000003 00000000 00000000 00000000 00000000 00000000 00000000 00000000

 CPU: After all inits, caps: 00000003 00000000 00000000 00000000 00000000 00000000 00000000 00000000
+CPU: After applying cleared_cpu_caps, caps: 00000013 00000000 00000000 00000000 00000000 00000000 00000000 00000000

Notice how the TSC cap bit goes from Off to On.

(The first two lines are printout loops from -git7 forward-ported
to -git8, the third line is the same printout loop added just after
the xor-with-cleared_cpu_caps[] loop.)

Here's how the breakage occurs:
1. arch/x86/kernel/tsc_32.c:tsc_init() sees !cpu_has_tsc,
   so bails and calls setup_clear_cpu_cap(X86_FEATURE_TSC).
2. include/asm-x86/cpufeature.h:setup_clear_cpu_cap(bit) clears
   the bit in boot_cpu_data and sets it in cleared_cpu_caps
3. arch/x86/kernel/cpu/common.c:identify_cpu() XORs all caps
   in with cleared_cpu_caps
   HOWEVER, at this point c->x86_capability correctly has TSC
   Off, cleared_cpu_caps has TSC On, so the XOR incorrectly
   sets TSC to On in c->x86_capability, with disastrous results.

The real bug is that clearing bits with XOR only works if the
bits are known to be 1 prior to the XOR, and that's not true here.

A simple fix is to convert the XOR to AND-NOT instead. The following
patch does that, and allows my 486 to boot 2.6.25-rc kernels again.

[ mingo@elte.hu: fixed a similar bug in setup_64.c as well. ]

The breakage was introduced via commit 7d851c8d3db0.

Signed-off-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common.c | 2 +-
 arch/x86/kernel/setup_64.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f86a3c4a2669..a38aafaefc23 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -504,7 +504,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 
 	/* Clear all flags overriden by options */
 	for (i = 0; i < NCAPINTS; i++)
-		c->x86_capability[i] ^= cleared_cpu_caps[i];
+		c->x86_capability[i] &= ~cleared_cpu_caps[i];
 
 	/* Init Machine Check Exception if available. */
 	mcheck_init(c);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 6fd804f07821..7637dc91c79b 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -1021,7 +1021,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 
 	/* Clear all flags overriden by options */
 	for (i = 0; i < NCAPINTS; i++)
-		c->x86_capability[i] ^= cleared_cpu_caps[i];
+		c->x86_capability[i] &= ~cleared_cpu_caps[i];
 
 #ifdef CONFIG_X86_MCE
 	mcheck_init(c);
-- 
cgit v1.2.2


From 3d00daf44654dc75629caf42816ac4e293658724 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 26 Feb 2008 13:00:18 -0800
Subject: x86: tls prevent_tail_call

Fix a kernel bug (vmware boot problem) reported by Tomasz Grobelny,
which occurs with certain .config variants and gccs.

The x86 TLS cleanup in commit efd1ca52d04d2f6df337a3332cee56cd60e6d4c4
made the sys_set_thread_area and sys_get_thread_area functions ripe for
tail call optimization.  If the compiler chooses to use it for them, it
can clobber the user trap frame because these are asmlinkage functions.

Reported-by: Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tls.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6dfd4e76661a..022bcaa3b42e 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -91,7 +91,9 @@ int do_set_thread_area(struct task_struct *p, int idx,
 
 asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
 {
-	return do_set_thread_area(current, -1, u_info, 1);
+	int ret = do_set_thread_area(current, -1, u_info, 1);
+	prevent_tail_call(ret);
+	return ret;
 }
 
 
@@ -139,7 +141,9 @@ int do_get_thread_area(struct task_struct *p, int idx,
 
 asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
 {
-	return do_get_thread_area(current, -1, u_info);
+	int ret = do_get_thread_area(current, -1, u_info);
+	prevent_tail_call(ret);
+	return ret;
 }
 
 int regset_tls_active(struct task_struct *target,
-- 
cgit v1.2.2


From d67bbacb4b557ece3b41abdcb616354ac0ce00e1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 27 Feb 2008 09:39:52 +0100
Subject: x86: restore vsyscall64 prochandler

a recent fix:

  commit ce28b9864b853803320c3f1d8de1b81aa4120b14
  Author: Thomas Gleixner <tglx@linutronix.de>
  Date:   Wed Feb 20 23:57:30 2008 +0100

    x86: fix vsyscall wreckage

removed the broken /kernel/vsyscall64 handler completely.
This triggers the following debug check:

  sysctl table check failed: /kernel/vsyscall64  No proc_handler

Restore the sane part of the proc handler.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vsyscall_64.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index b6be812fac05..edff4c985485 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -222,10 +222,19 @@ long __vsyscall(3) venosys_1(void)
 }
 
 #ifdef CONFIG_SYSCTL
+
+static int
+vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+}
+
 static ctl_table kernel_table2[] = {
 	{ .procname = "vsyscall64",
 	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
-	  .mode = 0644 },
+	  .mode = 0644,
+	  .proc_handler = vsyscall_sysctl_change },
 	{}
 };
 
-- 
cgit v1.2.2


From f2dbe03dccc95f41429d60e4221b02fc0f112cc4 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 27 Feb 2008 11:42:15 -0800
Subject: x86 vdso: fix build locale dependency

Priit Laes discovered that the sed command processing nm output was
sensitive to locale settings.  This was addressed in commit
03994f01e8b72b3d01fd3d09d1cc7c9f421a727c by using [:alnum:] in place of
[a-zA-Z0-9].

But that solution too is locale-dependent and may not always match
the identifiers it needs to.  The better fix is just to run sed et al
with a fixed locale setting in all builds.

Signed-off-by: Roland McGrath <roland@redhat.com>
CC: Priit Laes <plaes@plaes.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/vdso/Makefile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b8bd0c4aa02e..0a8f4742ef51 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -48,9 +48,11 @@ obj-$(VDSO64-y)			+= vdso-syms.lds
 # Match symbols in the DSO that look like VDSO*; produce a file of constants.
 #
 sed-vdsosym := -e 's/^00*/0/' \
-	-e 's/^\([[:xdigit:]]*\) . \(VDSO[[:alnum:]_]*\)$$/\2 = 0x\1;/p'
+	-e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
 quiet_cmd_vdsosym = VDSOSYM $@
-      cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
+define cmd_vdsosym
+	$(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
+endef
 
 $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
 	$(call if_changed,vdsosym)
-- 
cgit v1.2.2


From b16bf712f491808a8c926dd481c696fe7d73ee5a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 Feb 2008 14:02:08 +0100
Subject: x86: fix leak un ioremap_page_range() failure

Jan Beulich noticed it during code review that if a driver's ioremap()
fails (say due to -ENOMEM) then we might leak the struct vm_area.

Free it properly.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/ioremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 882328efc3db..ac3c959e271d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -162,7 +162,7 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	area->phys_addr = phys_addr;
 	vaddr = (unsigned long) area->addr;
 	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
-		remove_vm_area((void *)(vaddr & PAGE_MASK));
+		free_vm_area(area);
 		return NULL;
 	}
 
-- 
cgit v1.2.2


From 757265b8c57bb8fd91785d3d1a87fb483c86c9c2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 Feb 2008 20:19:06 +0100
Subject: x86: delay the export removal of init_mm

delay the removal of this symbol export by one more kernel release,
giving external modules such as VirtualBox a chance to stop using it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/init_task.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 5b3ce7934363..3d01e47777db 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,6 +15,7 @@ static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
+EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
 
 /*
  * Initial thread structure.
-- 
cgit v1.2.2


From 8be8f54bae3453588011cad06363813a5293af53 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 23 Feb 2008 20:43:21 +0100
Subject: x86: CPA: avoid split of alias mappings

avoid over-eager large page splitup.

When the target area needs to be split or is split already (ioremap)
then the current code enforces the split of large mappings in the alias
regions even if we could avoid it.

Use a separate variable processed in the cpa_data structure to carry
the number of pages which have been processed instead of reusing the
numpages variable. This keeps numpages intact and gives the alias code
a chance to keep large mappings intact.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 14e48b5a94ba..7049294fb469 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -26,6 +26,7 @@ struct cpa_data {
 	pgprot_t	mask_set;
 	pgprot_t	mask_clr;
 	int		numpages;
+	int		processed;
 	int		flushtlb;
 	unsigned long	pfn;
 };
@@ -290,8 +291,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 */
 	nextpage_addr = (address + psize) & pmask;
 	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
-	if (numpages < cpa->numpages)
-		cpa->numpages = numpages;
+	if (numpages < cpa->processed)
+		cpa->processed = numpages;
 
 	/*
 	 * We are safe now. Check whether the new pgprot is the same:
@@ -318,7 +319,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 */
 	addr = address + PAGE_SIZE;
 	pfn++;
-	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
+	for (i = 1; i < cpa->processed; i++, addr += PAGE_SIZE, pfn++) {
 		pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
 
 		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
@@ -342,7 +343,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * that we limited the number of possible pages already to
 	 * the number of pages in the large page.
 	 */
-	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+	if (address == (nextpage_addr - psize) && cpa->processed == numpages) {
 		/*
 		 * The address is aligned and the number of pages
 		 * covers the full page.
@@ -572,7 +573,7 @@ repeat:
 			set_pte_atomic(kpte, new_pte);
 			cpa->flushtlb = 1;
 		}
-		cpa->numpages = 1;
+		cpa->processed = 1;
 		return 0;
 	}
 
@@ -583,7 +584,7 @@ repeat:
 	do_split = try_preserve_large_page(kpte, address, cpa);
 	/*
 	 * When the range fits into the existing large page,
-	 * return. cp->numpages and cpa->tlbflush have been updated in
+	 * return. cp->processed and cpa->tlbflush have been updated in
 	 * try_large_page:
 	 */
 	if (do_split <= 0)
@@ -662,7 +663,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 		 * Store the remaining nr of pages for the large page
 		 * preservation check.
 		 */
-		cpa->numpages = numpages;
+		cpa->numpages = cpa->processed = numpages;
 
 		ret = __change_page_attr(cpa, checkalias);
 		if (ret)
@@ -679,9 +680,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 		 * CPA operation. Either a large page has been
 		 * preserved or a single page update happened.
 		 */
-		BUG_ON(cpa->numpages > numpages);
-		numpages -= cpa->numpages;
-		cpa->vaddr += cpa->numpages * PAGE_SIZE;
+		BUG_ON(cpa->processed > numpages);
+		numpages -= cpa->processed;
+		cpa->vaddr += cpa->processed * PAGE_SIZE;
 	}
 	return 0;
 }
-- 
cgit v1.2.2


From b4ef95de00be4c2c30feccf607a45093c8c118b7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 Feb 2008 09:40:27 +0100
Subject: x86: disable BTS ptrace extensions for now

revert the BTS ptrace extension for now.

based on general objections from Roland McGrath:

    http://lkml.org/lkml/2008/2/21/323

we'll let the BTS functionality cook some more and re-enable
it in v2.6.26. We'll leave the dead code around to help the
development of this code.

(X86_BTS is not defined at the moment)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process_32.c |  2 ++
 arch/x86/kernel/process_64.c |  2 ++
 arch/x86/kernel/ptrace.c     | 12 ++++++++++++
 3 files changed, 16 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a7d50a547dc2..be3c7a299f02 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -603,11 +603,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	}
 #endif
 
+#ifdef X86_BTS
 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
 
 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
 
 
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 43f287744f9f..3baf9b9f4c87 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -604,11 +604,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
 
+#ifdef X86_BTS
 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
 
 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
 }
 
 /*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index d862e396b099..f41fdc98efb1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -544,6 +544,8 @@ static int ptrace_set_debugreg(struct task_struct *child,
 	return 0;
 }
 
+#ifdef X86_BTS
+
 static int ptrace_bts_get_size(struct task_struct *child)
 {
 	if (!child->thread.ds_area_msr)
@@ -826,6 +828,7 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
 
 	ptrace_bts_write_record(tsk, &rec);
 }
+#endif /* X86_BTS */
 
 /*
  * Called by kernel/ptrace.c when detaching..
@@ -839,7 +842,9 @@ void ptrace_disable(struct task_struct *child)
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
 	if (child->thread.ds_area_msr) {
+#ifdef X86_BTS
 		ptrace_bts_realloc(child, 0, 0);
+#endif
 		child->thread.debugctlmsr &= ~ds_debugctl_mask();
 		if (!child->thread.debugctlmsr)
 			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
@@ -961,6 +966,10 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 #endif
 
+	/*
+	 * These bits need more cooking - not enabled yet:
+	 */
+#ifdef X86_BTS
 	case PTRACE_BTS_CONFIG:
 		ret = ptrace_bts_config
 			(child, data, (struct ptrace_bts_config __user *)addr);
@@ -988,6 +997,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		ret = ptrace_bts_drain
 			(child, data, (struct bts_struct __user *) addr);
 		break;
+#endif
 
 	default:
 		ret = ptrace_request(child, request, addr, data);
@@ -1226,12 +1236,14 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
 	case PTRACE_SETOPTIONS:
 	case PTRACE_SET_THREAD_AREA:
 	case PTRACE_GET_THREAD_AREA:
+#ifdef X86_BTS
 	case PTRACE_BTS_CONFIG:
 	case PTRACE_BTS_STATUS:
 	case PTRACE_BTS_SIZE:
 	case PTRACE_BTS_GET:
 	case PTRACE_BTS_CLEAR:
 	case PTRACE_BTS_DRAIN:
+#endif
 		return sys_ptrace(request, pid, addr, data);
 
 	default:
-- 
cgit v1.2.2


From d40e705903397445c6861a0a56c23e5b2e8f9b9a Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Fri, 29 Feb 2008 18:55:43 +0100
Subject: xen: mask out SEP from CPUID

Fix 32-on-64 pvops kernel:

we don't want userspace using syscall/sysenter, even if the hypervisor
supports it, so mask it out from CPUID.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/xen/enlighten.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 49e5358f481a..8b9ee27805fd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -153,6 +153,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 	if (*ax == 1)
 		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
 			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
+			    (1 << X86_FEATURE_SEP)  |  /* disable SEP */
 			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */
 
 	asm(XEN_EMULATE_PREFIX "cpuid"
-- 
cgit v1.2.2


From 334df50a866ff7e234c9566960997ca5b9d0a382 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 21 Jan 2008 13:09:33 +0100
Subject: KVM: SVM: Fix lazy FPU switching

If the guest writes to cr0 and leaves the TS flag at 0 while vcpu->fpu_active
is also 0, the TS flag in the guest's cr0 gets lost. This leads to corrupt FPU
state an causes Windows Vista 64bit to crash very soon after boot.  This patch
fixes this bug.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index de755cb1431d..511628916c4b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -792,6 +792,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	vcpu->arch.cr0 = cr0;
 	cr0 |= X86_CR0_PG | X86_CR0_WP;
 	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+	if (!vcpu->fpu_active)
+		cr0 |= X86_CR0_TS;
 	svm->vmcb->save.cr0 = cr0;
 }
 
-- 
cgit v1.2.2


From 6b390b6392309b98fd116b57c2926c44975cde26 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 29 Jan 2008 13:01:27 +0100
Subject: KVM: SVM: set NM intercept when enabling CR0.TS in the guest

Explicitly enable the NM intercept in svm_set_cr0 if we enable TS in the guest
copy of CR0 for lazy FPU switching. This fixes guest SMP with Linux under SVM.
Without that patch Linux deadlocks or panics right after trying to boot the
other CPUs.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 511628916c4b..d71daabbb51b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -792,8 +792,10 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	vcpu->arch.cr0 = cr0;
 	cr0 |= X86_CR0_PG | X86_CR0_WP;
 	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
-	if (!vcpu->fpu_active)
+	if (!vcpu->fpu_active) {
+		svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
 		cr0 |= X86_CR0_TS;
+	}
 	svm->vmcb->save.cr0 = cr0;
 }
 
-- 
cgit v1.2.2


From d730616384211436cfc84e6c2c1aa45351706a96 Mon Sep 17 00:00:00 2001
From: Paul Knowles <paul@transitive.com>
Date: Wed, 6 Feb 2008 11:02:35 +0000
Subject: KVM: Fix kvm_arch_vcpu_ioctl_set_sregs so that set_cr0 works properly

Whilst working on getting a VM to initialize in to IA32e mode I found
this issue. set_cr0 relies on comparing the old cr0 to the new one to
work correctly.  Move the assignment below so the compare can work.

Signed-off-by: Paul Knowles <paul@transitive.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf5308148689..ec60409299a3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2861,8 +2861,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
 
 	mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
-	vcpu->arch.cr0 = sregs->cr0;
 	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+	vcpu->arch.cr0 = sregs->cr0;
 
 	mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
-- 
cgit v1.2.2


From 674eea0fc4d1d693250b5d3ddad42ca931c87dfd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 11 Feb 2008 18:37:23 +0200
Subject: KVM: Make the supported cpuid list a host property rather than a vm
 property

One of the use cases for the supported cpuid list is to create a "greatest
common denominator" of cpu capabilities in a server farm.  As such, it is
useful to be able to get the list without creating a virtual machine first.

Since the code does not depend on the vm in any way, all that is needed is
to move it to the device ioctl handler.  The capability identifier is also
changed so that binaries made against -rc1 will fail gracefully.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ec60409299a3..a7069ec2267c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,9 @@
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
+static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+				    struct kvm_cpuid_entry2 __user *entries);
+
 struct kvm_x86_ops *kvm_x86_ops;
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
@@ -727,6 +730,24 @@ long kvm_arch_dev_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_GET_SUPPORTED_CPUID: {
+		struct kvm_cpuid2 __user *cpuid_arg = argp;
+		struct kvm_cpuid2 cpuid;
+
+		r = -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+			goto out;
+		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
+			cpuid_arg->entries);
+		if (r)
+			goto out;
+
+		r = -EFAULT;
+		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
@@ -974,8 +995,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	put_cpu();
 }
 
-static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
-				    struct kvm_cpuid2 *cpuid,
+static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				    struct kvm_cpuid_entry2 __user *entries)
 {
 	struct kvm_cpuid_entry2 *cpuid_entries;
@@ -1487,24 +1507,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
-	case KVM_GET_SUPPORTED_CPUID: {
-		struct kvm_cpuid2 __user *cpuid_arg = argp;
-		struct kvm_cpuid2 cpuid;
-
-		r = -EFAULT;
-		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
-			goto out;
-		r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
-			cpuid_arg->entries);
-		if (r)
-			goto out;
-
-		r = -EFAULT;
-		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
-			goto out;
-		r = 0;
-		break;
-	}
 	default:
 		;
 	}
-- 
cgit v1.2.2


From c7ac679c160db864810920df61a6ed14275011aa Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 11 Feb 2008 20:28:27 +0100
Subject: KVM: emulate access to MSR_IA32_MCG_CTL

Injecting an GP when accessing this MSR lets Windows crash when running some
stress test tools in KVM.  So this patch emulates access to this MSR.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a7069ec2267c..338764fa5391 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -487,6 +487,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
 			__FUNCTION__, data);
 		break;
+	case MSR_IA32_MCG_CTL:
+		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
+			__FUNCTION__, data);
+		break;
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_UCODE_WRITE:
 	case 0x200 ... 0x2ff: /* MTRRs */
@@ -529,6 +533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MC0_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MCG_CAP:
+	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MC0_MISC:
 	case MSR_IA32_MC0_MISC+4:
 	case MSR_IA32_MC0_MISC+8:
-- 
cgit v1.2.2


From 9b5cf48b06a52c04b85c88642c3b620db8e1d592 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 3 Mar 2008 01:17:37 +0100
Subject: x86: revert "x86: CPA: avoid split of alias mappings"

Revert:

  commit 8be8f54bae3453588011cad06363813a5293af53
  Author: Thomas Gleixner <tglx@linutronix.de>
  Date:   Sat Feb 23 20:43:21 2008 +0100

      x86: CPA: avoid split of alias mappings

because it clearly mishandles the case when __change_page_attr(), called
from __change_page_attr_set_clr(), changes cpa->processed to 1 and
cpa_process_alias(cpa) is executed right after that.

This crashes my x86-64 test box early in the boot process
(ref. http://bugzilla.kernel.org/show_bug.cgi?id=10140#c4).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7049294fb469..14e48b5a94ba 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -26,7 +26,6 @@ struct cpa_data {
 	pgprot_t	mask_set;
 	pgprot_t	mask_clr;
 	int		numpages;
-	int		processed;
 	int		flushtlb;
 	unsigned long	pfn;
 };
@@ -291,8 +290,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 */
 	nextpage_addr = (address + psize) & pmask;
 	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
-	if (numpages < cpa->processed)
-		cpa->processed = numpages;
+	if (numpages < cpa->numpages)
+		cpa->numpages = numpages;
 
 	/*
 	 * We are safe now. Check whether the new pgprot is the same:
@@ -319,7 +318,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 */
 	addr = address + PAGE_SIZE;
 	pfn++;
-	for (i = 1; i < cpa->processed; i++, addr += PAGE_SIZE, pfn++) {
+	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
 		pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
 
 		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
@@ -343,7 +342,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * that we limited the number of possible pages already to
 	 * the number of pages in the large page.
 	 */
-	if (address == (nextpage_addr - psize) && cpa->processed == numpages) {
+	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
 		/*
 		 * The address is aligned and the number of pages
 		 * covers the full page.
@@ -573,7 +572,7 @@ repeat:
 			set_pte_atomic(kpte, new_pte);
 			cpa->flushtlb = 1;
 		}
-		cpa->processed = 1;
+		cpa->numpages = 1;
 		return 0;
 	}
 
@@ -584,7 +583,7 @@ repeat:
 	do_split = try_preserve_large_page(kpte, address, cpa);
 	/*
 	 * When the range fits into the existing large page,
-	 * return. cp->processed and cpa->tlbflush have been updated in
+	 * return. cp->numpages and cpa->tlbflush have been updated in
 	 * try_large_page:
 	 */
 	if (do_split <= 0)
@@ -663,7 +662,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 		 * Store the remaining nr of pages for the large page
 		 * preservation check.
 		 */
-		cpa->numpages = cpa->processed = numpages;
+		cpa->numpages = numpages;
 
 		ret = __change_page_attr(cpa, checkalias);
 		if (ret)
@@ -680,9 +679,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 		 * CPA operation. Either a large page has been
 		 * preserved or a single page update happened.
 		 */
-		BUG_ON(cpa->processed > numpages);
-		numpages -= cpa->processed;
-		cpa->vaddr += cpa->processed * PAGE_SIZE;
+		BUG_ON(cpa->numpages > numpages);
+		numpages -= cpa->numpages;
+		cpa->vaddr += cpa->numpages * PAGE_SIZE;
 	}
 	return 0;
 }
-- 
cgit v1.2.2


From 72dc67a69690288538142df73a7e3ac66fea68dc Mon Sep 17 00:00:00 2001
From: Izik Eidus <izike@qumranet.com>
Date: Sun, 10 Feb 2008 18:04:15 +0200
Subject: KVM: remove the usage of the mmap_sem for the protection of the
 memory slots.

This patch replaces the mmap_sem lock for the memory slots with a new
kvm private lock, it is needed beacuse untill now there were cases where
kvm accesses user memory while holding the mmap semaphore.

Signed-off-by: Izik Eidus <izike@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 24 ++++++++++++++---
 arch/x86/kvm/paging_tmpl.h | 13 +++++++---
 arch/x86/kvm/vmx.c         |  7 +++--
 arch/x86/kvm/x86.c         | 65 ++++++++++++++++++++++++++--------------------
 4 files changed, 71 insertions(+), 38 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8efdcdbebb03..26037106ad19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -876,11 +876,18 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 {
+	struct page *page;
+
 	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
 
 	if (gpa == UNMAPPED_GVA)
 		return NULL;
-	return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	return page;
 }
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
@@ -1020,15 +1027,18 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	struct page *page;
 
+	down_read(&vcpu->kvm->slots_lock);
+
 	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, gfn);
+	up_read(&current->mm->mmap_sem);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __nonpaging_map(vcpu, v, write, gfn, page);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 
 	return r;
 }
@@ -1362,6 +1372,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	gfn_t gfn;
 	int r;
 	u64 gpte = 0;
+	struct page *page;
 
 	if (bytes != 4 && bytes != 8)
 		return;
@@ -1389,6 +1400,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	if (!is_present_pte(gpte))
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(vcpu->kvm, gfn);
+	up_read(&current->mm->mmap_sem);
+
 	vcpu->arch.update_pte.gfn = gfn;
 	vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
 }
@@ -1496,9 +1512,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t gpa;
 	int r;
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 03ba8608fe0f..2009c6e9dc4d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -91,7 +91,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
 	pt_element_t *table;
 	struct page *page;
 
+	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(kvm, table_gfn);
+	up_read(&current->mm->mmap_sem);
+
 	table = kmap_atomic(page, KM_USER0);
 
 	ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -378,7 +381,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (r)
 		return r;
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	/*
 	 * Look up the shadow pte for the faulting address.
 	 */
@@ -392,11 +395,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		pgprintk("%s: guest page fault\n", __FUNCTION__);
 		inject_page_fault(vcpu, addr, walker.error_code);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-		up_read(&current->mm->mmap_sem);
+		up_read(&vcpu->kvm->slots_lock);
 		return 0;
 	}
 
+	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
+	up_read(&current->mm->mmap_sem);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
@@ -413,14 +418,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	 */
 	if (shadow_pte && is_io_pte(*shadow_pte)) {
 		spin_unlock(&vcpu->kvm->mmu_lock);
-		up_read(&current->mm->mmap_sem);
+		up_read(&vcpu->kvm->slots_lock);
 		return 1;
 	}
 
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 
 	return write_pt;
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad36447e696e..86f5bf121838 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1477,7 +1477,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	struct kvm_userspace_memory_region kvm_userspace_mem;
 	int r = 0;
 
-	down_write(&current->mm->mmap_sem);
+	down_write(&kvm->slots_lock);
 	if (kvm->arch.apic_access_page)
 		goto out;
 	kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1487,9 +1487,12 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
 	if (r)
 		goto out;
+
+	down_read(&current->mm->mmap_sem);
 	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+	up_read(&current->mm->mmap_sem);
 out:
-	up_write(&current->mm->mmap_sem);
+	up_write(&kvm->slots_lock);
 	return r;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 338764fa5391..6b01552bd1f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -184,7 +184,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	int ret;
 	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 				  offset * sizeof(u64), sizeof(pdpte));
 	if (ret < 0) {
@@ -201,7 +201,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 
 	return ret;
 }
@@ -215,13 +215,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 	if (is_long_mode(vcpu) || !is_pae(vcpu))
 		return false;
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 	if (r < 0)
 		goto out;
 	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 out:
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 
 	return changed;
 }
@@ -359,7 +359,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		 */
 	}
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	/*
 	 * Does the new cr3 value map to physical memory? (Note, we
 	 * catch an invalid cr3 even in real-mode, because it would
@@ -375,7 +375,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		vcpu->arch.cr3 = cr3;
 		vcpu->arch.mmu.new_cr3(vcpu);
 	}
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 }
 EXPORT_SYMBOL_GPL(set_cr3);
 
@@ -1232,12 +1232,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
 		return -EINVAL;
 
-	down_write(&current->mm->mmap_sem);
+	down_write(&kvm->slots_lock);
 
 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
-	up_write(&current->mm->mmap_sem);
+	up_write(&kvm->slots_lock);
 	return 0;
 }
 
@@ -1286,7 +1286,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 	    < alias->target_phys_addr)
 		goto out;
 
-	down_write(&current->mm->mmap_sem);
+	down_write(&kvm->slots_lock);
 
 	p = &kvm->arch.aliases[alias->slot];
 	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1300,7 +1300,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 
 	kvm_mmu_zap_all(kvm);
 
-	up_write(&current->mm->mmap_sem);
+	up_write(&kvm->slots_lock);
 
 	return 0;
 
@@ -1376,7 +1376,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	struct kvm_memory_slot *memslot;
 	int is_dirty = 0;
 
-	down_write(&current->mm->mmap_sem);
+	down_write(&kvm->slots_lock);
 
 	r = kvm_get_dirty_log(kvm, log, &is_dirty);
 	if (r)
@@ -1392,7 +1392,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	}
 	r = 0;
 out:
-	up_write(&current->mm->mmap_sem);
+	up_write(&kvm->slots_lock);
 	return r;
 }
 
@@ -1570,7 +1570,7 @@ int emulator_read_std(unsigned long addr,
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	while (bytes) {
 		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 		unsigned offset = addr & (PAGE_SIZE-1);
@@ -1592,7 +1592,7 @@ int emulator_read_std(unsigned long addr,
 		addr += tocopy;
 	}
 out:
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 	return r;
 }
 EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1611,9 +1611,9 @@ static int emulator_read_emulated(unsigned long addr,
 		return X86EMUL_CONTINUE;
 	}
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1651,14 +1651,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 {
 	int ret;
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
 	if (ret < 0) {
-		up_read(&current->mm->mmap_sem);
+		up_read(&vcpu->kvm->slots_lock);
 		return 0;
 	}
 	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 	return 1;
 }
 
@@ -1670,9 +1670,9 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 	struct kvm_io_device *mmio_dev;
 	gpa_t                 gpa;
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 
 	if (gpa == UNMAPPED_GVA) {
 		kvm_inject_page_fault(vcpu, addr, 2);
@@ -1749,7 +1749,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 		char *kaddr;
 		u64 val;
 
-		down_read(&current->mm->mmap_sem);
+		down_read(&vcpu->kvm->slots_lock);
 		gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 
 		if (gpa == UNMAPPED_GVA ||
@@ -1760,13 +1760,17 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 			goto emul_write;
 
 		val = *(u64 *)new;
+
+		down_read(&current->mm->mmap_sem);
 		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+		up_read(&current->mm->mmap_sem);
+
 		kaddr = kmap_atomic(page, KM_USER0);
 		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
 		kunmap_atomic(kaddr, KM_USER0);
 		kvm_release_page_dirty(page);
 	emul_write:
-		up_read(&current->mm->mmap_sem);
+		up_read(&vcpu->kvm->slots_lock);
 	}
 #endif
 
@@ -2159,10 +2163,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		kvm_x86_ops->skip_emulated_instruction(vcpu);
 
 	for (i = 0; i < nr_pages; ++i) {
-		down_read(&current->mm->mmap_sem);
+		down_read(&vcpu->kvm->slots_lock);
 		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
 		vcpu->arch.pio.guest_pages[i] = page;
-		up_read(&current->mm->mmap_sem);
+		up_read(&vcpu->kvm->slots_lock);
 		if (!page) {
 			kvm_inject_gp(vcpu, 0);
 			free_pio_guest_pages(vcpu);
@@ -2485,8 +2489,9 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
 
 	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-	vcpu->arch.apic->vapic_page = page;
 	up_read(&current->mm->mmap_sem);
+
+	vcpu->arch.apic->vapic_page = page;
 }
 
 static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -2959,9 +2964,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	gpa_t gpa;
 
 	vcpu_load(vcpu);
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 	tr->physical_address = gpa;
 	tr->valid = gpa != UNMAPPED_GVA;
 	tr->writeable = 1;
@@ -3234,11 +3239,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
+			down_write(&current->mm->mmap_sem);
 			memslot->userspace_addr = do_mmap(NULL, 0,
 						     npages * PAGE_SIZE,
 						     PROT_READ | PROT_WRITE,
 						     MAP_SHARED | MAP_ANONYMOUS,
 						     0);
+			up_write(&current->mm->mmap_sem);
 
 			if (IS_ERR((void *)memslot->userspace_addr))
 				return PTR_ERR((void *)memslot->userspace_addr);
@@ -3246,8 +3253,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 			if (!old.user_alloc && old.rmap) {
 				int ret;
 
+				down_write(&current->mm->mmap_sem);
 				ret = do_munmap(current->mm, old.userspace_addr,
 						old.npages * PAGE_SIZE);
+				up_write(&current->mm->mmap_sem);
 				if (ret < 0)
 					printk(KERN_WARNING
 				       "kvm_vm_ioctl_set_memory_region: "
-- 
cgit v1.2.2


From a2938c807024ba30191e3bd593430c0659d75717 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 13 Feb 2008 16:30:28 +0100
Subject: KVM: SVM: fix Windows XP 64 bit installation crash

While installing Windows XP 64 bit wants to access the DEBUGCTL and the last
branch record (LBR) MSRs. Don't allowing this in KVM causes the installation to
crash. This patch allow the access to these MSRs and fixes the issue.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d71daabbb51b..1a582f1090e8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1100,6 +1100,24 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	case MSR_IA32_SYSENTER_ESP:
 		*data = svm->vmcb->save.sysenter_esp;
 		break;
+	/* Nobody will change the following 5 values in the VMCB so
+	   we can safely return them on rdmsr. They will always be 0
+	   until LBRV is implemented. */
+	case MSR_IA32_DEBUGCTLMSR:
+		*data = svm->vmcb->save.dbgctl;
+		break;
+	case MSR_IA32_LASTBRANCHFROMIP:
+		*data = svm->vmcb->save.br_from;
+		break;
+	case MSR_IA32_LASTBRANCHTOIP:
+		*data = svm->vmcb->save.br_to;
+		break;
+	case MSR_IA32_LASTINTFROMIP:
+		*data = svm->vmcb->save.last_excp_from;
+		break;
+	case MSR_IA32_LASTINTTOIP:
+		*data = svm->vmcb->save.last_excp_to;
+		break;
 	default:
 		return kvm_get_msr_common(vcpu, ecx, data);
 	}
@@ -1160,6 +1178,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 	case MSR_IA32_SYSENTER_ESP:
 		svm->vmcb->save.sysenter_esp = data;
 		break;
+	case MSR_IA32_DEBUGCTLMSR:
+		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+				__FUNCTION__, data);
+		break;
 	case MSR_K7_EVNTSEL0:
 	case MSR_K7_EVNTSEL1:
 	case MSR_K7_EVNTSEL2:
-- 
cgit v1.2.2


From 5e4a0b3c1b899bb0ba28bde6edf95c5ddeb48b5c Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <marcelo@kvack.org>
Date: Thu, 14 Feb 2008 21:21:43 -0200
Subject: KVM: move alloc_apic_access_page() outside of non-preemptable region

alloc_apic_access_page() can sleep, while vmx_vcpu_setup is called
inside a non preemptable region. Move it after put_cpu().

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 86f5bf121838..61c2a3a8d20a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1605,9 +1605,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
 	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
 
-	if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
-		if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
-			return -ENOMEM;
 
 	return 0;
 }
@@ -2537,6 +2534,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	put_cpu();
 	if (err)
 		goto free_vmcs;
+	if (vm_need_virtualize_apic_accesses(kvm))
+		if (alloc_apic_access_page(kvm) != 0)
+			goto free_vmcs;
 
 	return &vmx->vcpu;
 
-- 
cgit v1.2.2


From 24993d53495d1f9b844f8eb3ebd1b9efd3521617 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <marcelo@kvack.org>
Date: Thu, 14 Feb 2008 21:25:39 -0200
Subject: KVM: make MMU_DEBUG compile again

the cr3 variable is now inside the vcpu->arch structure.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 2 +-
 arch/x86/kvm/paging_tmpl.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 26037106ad19..194ece6974e6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1182,7 +1182,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
-	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
+	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
 	mmu_free_roots(vcpu);
 }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2009c6e9dc4d..5588fa594b61 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -143,7 +143,7 @@ walk:
 	}
 #endif
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-	       (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
 
 	pt_access = ACC_ALL;
 
-- 
cgit v1.2.2


From 0b975a3c2d53829fa978e18fabae7d99031f588f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 24 Feb 2008 14:37:50 +0200
Subject: KVM: Avoid infinite-frequency local apic timer

If the local apic initial count is zero, don't start a an hrtimer with infinite
frequency, locking up the host.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/lapic.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2cbee9479ce4..68a6b1511934 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -647,6 +647,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
 	apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
 		    APIC_BUS_CYCLE_NS * apic->timer.divide_count;
 	atomic_set(&apic->timer.pending, 0);
+
+	if (!apic->timer.period)
+		return;
+
 	hrtimer_start(&apic->timer.dev,
 		      ktime_add_ns(now, apic->timer.period),
 		      HRTIMER_MODE_ABS);
-- 
cgit v1.2.2


From f7d9c7b7b902f9f532738d47593d9679b0b182d9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 26 Feb 2008 22:12:10 +0200
Subject: KVM: MMU: Fix race when instantiating a shadow pte

For improved concurrency, the guest walk is performed concurrently with other
vcpus.  This means that we need to revalidate the guest ptes once we have
write-protected the guest page tables, at which point they can no longer be
modified.

The current code attempts to avoid this check if the shadow page table is not
new, on the assumption that if it has existed before, the guest could not have
modified the pte without the shadow lock.  However the assumption is incorrect,
as the racing vcpu could have modified the pte, then instantiated the shadow
page, before our vcpu regains control:

  vcpu0        vcpu1

  fault
  walk pte

               modify pte
               fault in same pagetable
               instantiate shadow page

  lookup shadow page
  conclude it is old
  instantiate spte based on stale guest pte

We could do something clever with generation counters, but a test run by
Marcelo suggests this is unnecessary and we can just do the revalidation
unconditionally.  The pte will be in the processor cache and the check can
be quite fast.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 12 ++++--------
 arch/x86/kvm/paging_tmpl.h |  5 ++---
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 194ece6974e6..d8172aabc660 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -681,8 +681,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     unsigned level,
 					     int metaphysical,
 					     unsigned access,
-					     u64 *parent_pte,
-					     bool *new_page)
+					     u64 *parent_pte)
 {
 	union kvm_mmu_page_role role;
 	unsigned index;
@@ -722,8 +721,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	vcpu->arch.mmu.prefetch_page(vcpu, sp);
 	if (!metaphysical)
 		rmap_write_protect(vcpu->kvm, gfn);
-	if (new_page)
-		*new_page = 1;
 	return sp;
 }
 
@@ -1006,8 +1003,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
 				>> PAGE_SHIFT;
 			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
 						     v, level - 1,
-						     1, ACC_ALL, &table[index],
-						     NULL);
+						     1, ACC_ALL, &table[index]);
 			if (!new_table) {
 				pgprintk("nonpaging_map: ENOMEM\n");
 				kvm_release_page_clean(page);
@@ -1100,7 +1096,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 
 		ASSERT(!VALID_PAGE(root));
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-				      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
+				      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		vcpu->arch.mmu.root_hpa = root;
@@ -1121,7 +1117,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, !is_paging(vcpu),
-				      ACC_ALL, NULL, NULL);
+				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5588fa594b61..ecc0856268c4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -300,7 +300,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		u64 shadow_pte;
 		int metaphysical;
 		gfn_t table_gfn;
-		bool new_page = 0;
 
 		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
 		if (level == PT_PAGE_TABLE_LEVEL)
@@ -322,8 +321,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		}
 		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
 					       metaphysical, access,
-					       shadow_ent, &new_page);
-		if (new_page && !metaphysical) {
+					       shadow_ent);
+		if (!metaphysical) {
 			int r;
 			pt_element_t curr_pte;
 			r = kvm_read_guest_atomic(vcpu->kvm,
-- 
cgit v1.2.2


From 33f9c505ed5c83bd8a07877e5b4628308f4cc099 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Feb 2008 16:06:57 +0200
Subject: KVM: VMX: Avoid rearranging switched guest msrs while they are loaded

KVM tries to run as much as possible with the guest msrs loaded instead of
host msrs, since switching msrs is very expensive.  It also tries to minimize
the number of msrs switched according to the guest mode; for example,
MSR_LSTAR is needed only by long mode guests.  This optimization is done by
setup_msrs().

However, we must not change which msrs are switched while we are running with
guest msr state:

 - switch to guest msr state
 - call setup_msrs(), removing some msrs from the list
 - switch to host msr state, leaving a few guest msrs loaded

An easy way to trigger this is to kexec an x86_64 linux guest.  Early during
setup, the guest will switch EFER to not include SCE.  KVM will stop saving
MSR_LSTAR, and on the next msr switch it will leave the guest LSTAR loaded.
The next host syscall will end up in a random location in the kernel.

Fix by reloading the host msrs before changing the msr list.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 61c2a3a8d20a..94ea724638fd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -638,6 +638,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 {
 	int save_nmsrs;
 
+	vmx_load_host_state(vmx);
 	save_nmsrs = 0;
 #ifdef CONFIG_X86_64
 	if (is_long_mode(&vmx->vcpu)) {
-- 
cgit v1.2.2


From 1a4e3f89c6b2cbe0b26c08ec63a8c34156eaae04 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 20 Feb 2008 09:20:08 -0800
Subject: x86: disable KVM for Voyager and friends

Most classic Pentiums don't have hardware virtualization extension,
and building kvm with Voyager, Visual Workstation, or NUMAQ
generates spurious failures.

Signed-off-by: Avi Kivity <avi@qumranet.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4a88cf7695b4..53800b80a204 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
-	select HAVE_KVM
+	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 
 
 config GENERIC_LOCKBREAK
-- 
cgit v1.2.2


From 18a8622101154277df97e24097ed17aace84fa3a Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 3 Mar 2008 13:01:08 -0800
Subject: x86, i387: fix ptrace leakage using init_fpu()

This bug got introduced by the recent i387 merge:

  commit 4421011120b2304e5c248ae4165a2704588aedf1
  Author: Roland McGrath <roland@redhat.com>
  Date:   Wed Jan 30 13:31:50 2008 +0100

      x86: x86 i387 user_regset

Current usage of unlazy_fpu() in ptrace specific routines is wrong.
unlazy_fpu() will not init fpu if the task never used math. So the
ptrace calls can expose the parent tasks FPU data in some cases.

Replace it with the init_fpu() which will init the math state, if the
task never used math before.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/i387.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 763dfc407232..60fe80157569 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -132,7 +132,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 	if (!cpu_has_fxsr)
 		return -ENODEV;
 
-	unlazy_fpu(target);
+	init_fpu(target);
 
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
 				   &target->thread.i387.fxsave, 0, -1);
@@ -147,7 +147,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (!cpu_has_fxsr)
 		return -ENODEV;
 
-	unlazy_fpu(target);
+	init_fpu(target);
 	set_stopped_child_used_math(target);
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
@@ -307,7 +307,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
 	if (!HAVE_HWFP)
 		return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
 
-	unlazy_fpu(target);
+	init_fpu(target);
 
 	if (!cpu_has_fxsr)
 		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
@@ -332,7 +332,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (!HAVE_HWFP)
 		return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
 
-	unlazy_fpu(target);
+	init_fpu(target);
 	set_stopped_child_used_math(target);
 
 	if (!cpu_has_fxsr)
-- 
cgit v1.2.2


From 7c9e92b6cdc9937eee53600e5d49a25e421463dd Mon Sep 17 00:00:00 2001
From: Yinghai Lu <Yinghai.Lu@Sun.COM>
Date: Tue, 19 Feb 2008 15:35:54 -0800
Subject: x86: not set node to cpu_to_node if the node is not online

resolve boot problem reported by Mel Gorman:

   http://lkml.org/lkml/2008/2/13/404

init_cpu_to_node will use cpu->apic (from MADT or mptable) and
apic->node(from SRAT or AMD config space with k8_bus_64.c) to have
cpu->node mapping, and later identify_cpu will overwrite them
again...(with nearby_node...)

this patch checks if the node is online, otherwise it will not
update cpu_node map. so keep cpu_node map to online node before
identify_cpu..., to prevent possible error.

Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/numa_64.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 59898fb0a4aa..8ccfee10f5b5 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -622,13 +622,17 @@ void __init init_cpu_to_node(void)
 	int i;
 
 	for (i = 0; i < NR_CPUS; i++) {
+		int node;
 		u16 apicid = x86_cpu_to_apicid_init[i];
 
 		if (apicid == BAD_APICID)
 			continue;
-		if (apicid_to_node[apicid] == NUMA_NO_NODE)
+		node = apicid_to_node[apicid];
+		if (node == NUMA_NO_NODE)
 			continue;
-		numa_set_node(i, apicid_to_node[apicid]);
+		if (!node_online(node))
+			continue;
+		numa_set_node(i, node);
 	}
 }
 
-- 
cgit v1.2.2


From 87d034f3139b5f0d93df2ba58f37d6f2c2c7eeb6 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ijc@hellion.org.uk>
Date: Thu, 28 Feb 2008 23:16:49 +0000
Subject: x86/xen: fix DomU boot problem

Construct Xen guest e820 map with a hole between 640K-1M.

It's pure luck that Xen kernels have gotten away with it in the past.

The patch below seems like the right thing to do. It certainly boots in
a domU without the DMI problem (without any of the other related patches
such as Alexander's).

Signed-off-by: Ian Campbell <ijc@hellion.org.uk>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Tested-by: Mark McLoughlin <markmc@redhat.com>
Acked-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/xen/setup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 3bad4773a2f3..2341492bf7a0 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -38,7 +38,8 @@ char * __init xen_memory_setup(void)
 	unsigned long max_pfn = xen_start_info->nr_pages;
 
 	e820.nr_map = 0;
-	add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+	add_memory_region(0, LOWMEMSIZE(), E820_RAM);
+	add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
 
 	return "Xen";
 }
-- 
cgit v1.2.2


From fcab59a3186640ce085e89ee6dfc03cacfb6c7c9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 4 Mar 2008 19:33:24 +0000
Subject: x86: a P4 is a P6 not an i486

P4 has been coming out as CPU_FAMILY=4 instead of 6: fix MPENTIUM4 typo.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig.cpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6d50064db182..9304bfba7d45 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -388,7 +388,7 @@ config X86_OOSTORE
 #
 config X86_P6_NOP
 	def_bool y
-	depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4)
+	depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4)
 
 config X86_TSC
 	def_bool y
-- 
cgit v1.2.2


From 9edddaa200df18e08fe0cf21036e8ae467b1363c Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Tue, 4 Mar 2008 14:28:37 -0800
Subject: Kprobes: indicate kretprobe support in Kconfig

Add CONFIG_HAVE_KRETPROBES to the arch/<arch>/Kconfig file for relevant
architectures with kprobes support.  This facilitates easy handling of
in-kernel modules (like samples/kprobes/kretprobe_example.c) that depend on
kretprobes being present in the kernel.

Thanks to Sam Ravnborg for helping make the patch more lean.

Per Mathieu's suggestion, added CONFIG_KRETPROBES and fixed up dependencies.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53800b80a204..f41c9538ca30 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,6 +21,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 
 
-- 
cgit v1.2.2


From 0e5aa8d6218f9914b23e492debf653bda5598af3 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 15 Feb 2008 18:11:14 -0500
Subject: [CPUFREQ] Remove debugging message from e_powersaver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We don't need to printk a message every time we transition.
Leave the code there, but ifdef'd out, as it's useful when
adding support for new processors.

Reported-by: Petr Titěra <P.Titera@century.cz>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
index 39f8cb18296c..c2f930d86640 100644
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -55,7 +55,6 @@ static int eps_set_state(struct eps_cpu_data *centaur,
 {
 	struct cpufreq_freqs freqs;
 	u32 lo, hi;
-	u8 current_multiplier, current_voltage;
 	int err = 0;
 	int i;
 
@@ -95,6 +94,10 @@ postchange:
 	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
 	freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
 
+#ifdef DEBUG
+	{
+	u8 current_multiplier, current_voltage;
+
 	/* Print voltage and multiplier */
 	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
 	current_voltage = lo & 0xff;
@@ -103,7 +106,8 @@ postchange:
 	current_multiplier = (lo >> 8) & 0xff;
 	printk(KERN_INFO "eps: Current multiplier = %d\n",
 		current_multiplier);
-
+	}
+#endif
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	return err;
 }
-- 
cgit v1.2.2


From e40cd10ccff3d9fbffd57b93780bee4b7b9bff51 Mon Sep 17 00:00:00 2001
From: Aurelien Jarno <aurelien@aurel32.net>
Date: Wed, 5 Mar 2008 19:14:24 +0100
Subject: x86: clear DF before calling signal handler

The Linux kernel currently does not clear the direction flag before
calling a signal handler, whereas the x86/x86-64 ABI requires that.

Linux had this behavior/bug forever, but this becomes a real problem
with gcc version 4.3, which assumes that the direction flag is
correctly cleared at the entry of a function.

This patches changes the setup_frame() functions to clear the
direction before entering the signal handler.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/ia32/ia32_signal.c | 4 ++--
 arch/x86/kernel/signal_32.c | 4 ++--
 arch/x86/kernel/signal_64.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 1c0503bdfb1a..5e7771a3ba2f 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -500,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
 	regs->ss = __USER32_DS;
 
 	set_fs(USER_DS);
-	regs->flags &= ~X86_EFLAGS_TF;
+	regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
 	if (test_thread_flag(TIF_SINGLESTEP))
 		ptrace_notify(SIGTRAP);
 
@@ -600,7 +600,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	regs->ss = __USER32_DS;
 
 	set_fs(USER_DS);
-	regs->flags &= ~X86_EFLAGS_TF;
+	regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
 	if (test_thread_flag(TIF_SINGLESTEP))
 		ptrace_notify(SIGTRAP);
 
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index caee1f002fed..0157a6f0f41f 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -407,7 +407,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
 	 * The tracer may want to single-step inside the
 	 * handler too.
 	 */
-	regs->flags &= ~TF_MASK;
+	regs->flags &= ~(TF_MASK | X86_EFLAGS_DF);
 	if (test_thread_flag(TIF_SINGLESTEP))
 		ptrace_notify(SIGTRAP);
 
@@ -500,7 +500,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	 * The tracer may want to single-step inside the
 	 * handler too.
 	 */
-	regs->flags &= ~TF_MASK;
+	regs->flags &= ~(TF_MASK | X86_EFLAGS_DF);
 	if (test_thread_flag(TIF_SINGLESTEP))
 		ptrace_notify(SIGTRAP);
 
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 7347bb14e306..56b72fb67f9b 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -295,7 +295,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	   see include/asm-x86_64/uaccess.h for details. */
 	set_fs(USER_DS);
 
-	regs->flags &= ~X86_EFLAGS_TF;
+	regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
 	if (test_thread_flag(TIF_SINGLESTEP))
 		ptrace_notify(SIGTRAP);
 #ifdef DEBUG_SIG
-- 
cgit v1.2.2


From 609b5297bcfb7b39b7a4137e9ec48407a8c96763 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 5 Mar 2008 08:35:14 +0000
Subject: x86: fix merge mistake in i387.c

convert_fxsr_to_user() in 2.6.24's i387_32.c did this, and
convert_to_fxsr() also does the inverse, so I assume it's an oversight
that it is no longer being done.

[ mingo@elte.hu:

  we encode it this way because there's no space for the 'FPU Last
  Instruction Opcode' (->fop) field in the legacy user_i387_ia32_struct
  that PTRACE_GETFPREGS/PTRACE_SETFPREGS uses.

  it's probably pure legacy - i'd be surprised if any user-space relied on
  the FPU Last Opcode in any way. But indeed we used to do it previously
  so the most conservative thing is to preserve that piece of information.
]

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/i387.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 60fe80157569..d2e39e69aaf8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -261,7 +261,7 @@ static void convert_from_fxsr(struct user_i387_ia32_struct *env,
 	}
 #else
 	env->fip = fxsave->fip;
-	env->fcs = fxsave->fcs;
+	env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
 	env->foo = fxsave->foo;
 	env->fos = fxsave->fos;
 #endif
-- 
cgit v1.2.2


From d032b31a3a22a571cb50c0b5dffbe9ba9328d6e2 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 5 Mar 2008 08:36:48 +0000
Subject: x86: fix typo in step.c

TIF_DEBUGCTLMSR has no meaning in the actual MSR...

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/step.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 2ef1a5f8d675..9d406cdc847f 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
 				  child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
 	} else {
 	    write_debugctlmsr(child,
-			      child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
+			      child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
 
 	    if (!child->thread.debugctlmsr)
 		    clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
@@ -189,7 +189,7 @@ void user_disable_single_step(struct task_struct *child)
 	 * Make sure block stepping (BTF) is disabled.
 	 */
 	write_debugctlmsr(child,
-			  child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
+			  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
 
 	if (!child->thread.debugctlmsr)
 		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-- 
cgit v1.2.2


From 7432d149fda8ce9ead9df91e577b83ce52ad5f65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 6 Mar 2008 18:29:43 +0100
Subject: x86: re-add reboot fixups

Jan Beulich noticed that the reboot fixups went missing during
reboot.c unification.

(commit 4d022e35fd7e07c522c7863fee6f07e53cf3fc14)

Geode and a few other rare boards with special reboot quirks are
affected.

Reported-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 7fd6ac43e4a1..55ceb8cdef75 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -326,6 +326,10 @@ static inline void kb_wait(void)
 	}
 }
 
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
 static void native_machine_emergency_restart(void)
 {
 	int i;
@@ -337,6 +341,8 @@ static void native_machine_emergency_restart(void)
 		/* Could also try the reset bit in the Hammer NB */
 		switch (reboot_type) {
 		case BOOT_KBD:
+			mach_reboot_fixups(); /* for board specific fixups */
+
 			for (i = 0; i < 10; i++) {
 				kb_wait();
 				udelay(50);
-- 
cgit v1.2.2


From 1722770f131bb5c8e238825f3eba2efa331483a2 Mon Sep 17 00:00:00 2001
From: Peter Korsgaard <jacmet@sunsite.dk>
Date: Thu, 6 Mar 2008 10:56:45 +0100
Subject: x86-boot: don't request VBE2 information

The new x86 setup code (4fd06960f120) broke booting on an old P3/500MHz
with an onboard Voodoo3 of mine. After debugging it, it turned out
to be caused by the fact that the vesa probing now asks for VBE2 data.

Disassembing the video BIOS shows that it overflows the vesa_general_info
structure when VBE2 data is requested because the source addresses for the
information strings which get strcpy'ed to the buffer lie outside the 32K
BIOS code (and hence contain long sequences of 0xff's).

E.G.:

get_vbe_controller_info:
00002A9C  60                pushaw
00002A9D  1E                push ds
00002A9E  0E                push cs
00002A9F  1F                pop ds
00002AA0  2BC9              sub cx,cx
00002AA2  6626813D56424532  cmp dword [es:di],0x32454256 ; "VBE2"
00002AAA  7501              jnz .1
00002AAC  41                inc cx
.1:
00002AAD  51                push cx
00002AAE  B91400            mov cx,0x14
00002AB1  BED47F            mov si, controller_header
00002AB4  57                push di
00002AB5  F3A4              rep movsb ; copy vbe1.2 header

00002AB7  B9EC00            mov cx,0xec
00002ABA  2AC0              sub al,al
00002ABC  F3AA              rep stosb ; zero pad remainder

00002ABE  5F                pop di
00002ABF  E8EB0D            call word get_memory
00002AC2  C1E002            shl ax,0x2
00002AC5  26894512          mov [es:di+0x12],ax ; total memory
00002AC9  26C745040003      mov word [es:di+0x4],0x300 ; VBE version
00002ACF  268C4D08          mov [es:di+0x8],cs
00002AD3  268C4D10          mov [es:di+0x10],cs
00002AD7  59                pop cx
00002AD8  E361              jcxz .done ; VBE2 requested?
00002ADA  8D9D0001          lea bx,[di+0x100]
00002ADE  53                push bx
00002ADF  87DF              xchg bx,di ; di now points to 2nd half
00002AE1  26C747140001      mov word [es:bx+0x14],0x100 ; sw rev

00002AE7  26897F06          mov [es:bx+0x6],di		; oem string
00002AEB  268C4708          mov [es:bx+0x8],es
00002AEF  BE5280            mov si,0x8052 ; oem string
00002AF2  E87A1B            call word strcpy

00002AF5  26897F0E          mov [es:bx+0xe],di ; video mode list
00002AF9  268C4710          mov [es:bx+0x10],es
00002AFD  B91E00            mov cx,0x1e
00002B00  BEE87F            mov si,vidmodes
00002B03  F3A5              rep movsw

00002B05  26897F16          mov [es:bx+0x16],di ; oem vendor
00002B09  268C4718          mov [es:bx+0x18],es
00002B0D  BE2480            mov si,0x8024 ; oem vendor
00002B10  E85C1B            call word strcpy

00002B13  26897F1A          mov [es:bx+0x1a],di ; oem product
00002B17  268C471C          mov [es:bx+0x1c],es
00002B1B  BE3880            mov si,0x8038 ; oem product
00002B1E  E84E1B            call word strcpy

00002B21  26897F1E          mov [es:bx+0x1e],di ; oem product rev
00002B25  268C4720          mov [es:bx+0x20],es
00002B29  BE4580            mov si,0x8045 ; oem product rev
00002B2C  E8401B            call word strcpy

00002B2F  58                pop ax
00002B30  B90001            mov cx,0x100
00002B33  2BCF              sub cx,di
00002B35  03C8              add cx,ax
00002B37  2AC0              sub al,al
00002B39  F3AA              rep stosb ; zero pad
.done:
00002B3B  1F                pop ds
00002B3C  61                popaw
00002B3D  B84F00            mov ax,0x4f
00002B40  C3                ret

(The full BIOS can be found at http://peter.korsgaard.com/vgabios.bin
if interested).

The old setup code didn't ask for VBE2 info, and the new code doesn't
actually do anything with the extra information, so the fix is to simply
not request it. Other BIOS'es might have the same problem.

Signed-off-by: Peter Korsgaard <jacmet@sunsite.dk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/boot/vesa.h       | 9 +--------
 arch/x86/boot/video-vesa.c | 2 --
 2 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/vesa.h b/arch/x86/boot/vesa.h
index ff5b73cd406f..468e444622c5 100644
--- a/arch/x86/boot/vesa.h
+++ b/arch/x86/boot/vesa.h
@@ -26,17 +26,10 @@ struct vesa_general_info {
 	far_ptr video_mode_ptr;	/* 14 */
 	u16 total_memory;	/* 18 */
 
-	u16 oem_software_rev;	/* 20 */
-	far_ptr oem_vendor_name_ptr;	/* 22 */
-	far_ptr oem_product_name_ptr;	/* 26 */
-	far_ptr oem_product_rev_ptr;	/* 30 */
-
-	u8 reserved[222];	/* 34 */
-	u8 oem_data[256];	/* 256 */
+	u8 reserved[236];	/* 20 */
 } __attribute__ ((packed));
 
 #define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
-#define VBE2_MAGIC ('V' + ('B' << 8) + ('E' << 16) + ('2' << 24))
 
 struct vesa_mode_info {
 	u16 mode_attr;		/* 0 */
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 662dd2f13068..419b5c273374 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -37,8 +37,6 @@ static int vesa_probe(void)
 
 	video_vesa.modes = GET_HEAP(struct mode_info, 0);
 
-	vginfo.signature = VBE2_MAGIC;
-
 	ax = 0x4f00;
 	di = (size_t)&vginfo;
 	asm(INT10
-- 
cgit v1.2.2


From 84c6f6046c5a2189160a8f0dca8b90427bf690ea Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 7 Mar 2008 14:56:02 -0800
Subject: x86_64: make ptrace always sign-extend orig_ax to 64 bits

This makes 64-bit ptrace calls setting the 64-bit orig_ax field for a
32-bit task sign-extend the low 32 bits up to 64.  This matches what a
64-bit debugger expects when tracing a 32-bit task.

This follows on my "x86_64 ia32 syscall restart fix".  This didn't
matter until that was fixed.

The debugger ignores or zeros the high half of every register slot it
sets (including the orig_rax pseudo-register) uniformly.  It expects
that the setting of the low 32 bits always has the same meaning as a
32-bit debugger setting those same 32 bits with native 32-bit
facilities.

This never arose before because the syscall restart check never
matched any -ERESTART* values due to lack of sign extension.  Before
that fix, even 32-bit ptrace setting orig_eax to -1 failed to trigger
the restart check anyway.  So this was never noticed as a regression
of 64-bit debuggers vs 32-bit debuggers on the same 64-bit kernel.

Signed-off-by: Roland McGrath <roland@redhat.com>
[ Changed to just do the sign-extension unconditionally on x86-64,
  since orig_ax is always just a small integer and doesn't need
  the full 64-bit range ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/ptrace.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f41fdc98efb1..8f64abe699fd 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -323,6 +323,16 @@ static int putreg(struct task_struct *child,
 		return set_flags(child, value);
 
 #ifdef CONFIG_X86_64
+	/*
+	 * Orig_ax is really just a flag with small positive and
+	 * negative values, so make sure to always sign-extend it
+	 * from 32 bits so that it works correctly regardless of
+	 * whether we come from a 32-bit environment or not.
+	 */
+	case offsetof(struct user_regs_struct, orig_ax):
+		value = (long) (s32) value;
+		break;
+
 	case offsetof(struct user_regs_struct,fs_base):
 		if (value >= TASK_SIZE_OF(child))
 			return -EIO;
-- 
cgit v1.2.2


From 3fabc55f34b72720e8a10aa442bd3415a211edb3 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 11 Mar 2008 09:35:56 -0500
Subject: lguest: Sanitize the lguest clock.

Now the TSC code handles a zero return from calculate_cpu_khz(),
lguest can simply pass through the value it gets from the Host: if
non-zero, all the normal TSC code applies.

Otherwise (or if the Host really doesn't support TSC), the clocksource
code will fall back to the slower but reasonable lguest clock.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/boot.c | 53 ++++++++++++++++++++------------------------------
 1 file changed, 21 insertions(+), 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index cccb38a59653..9c27c104d83c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -84,7 +84,6 @@ struct lguest_data lguest_data = {
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
 	.syscall_vec = SYSCALL_VECTOR,
 };
-static cycle_t clock_base;
 
 /*G:037 async_hcall() is pretty simple: I'm quite proud of it really.  We have a
  * ring buffer of stored hypercalls which the Host will run though next time we
@@ -327,8 +326,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 	case 1:	/* Basic feature request. */
 		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
 		*cx &= 0x00002201;
-		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
-		*dx &= 0x07808101;
+		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
+		*dx &= 0x07808111;
 		/* The Host can do a nice optimization if it knows that the
 		 * kernel mappings (addresses above 0xC0000000 or whatever
 		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
@@ -595,19 +594,25 @@ static unsigned long lguest_get_wallclock(void)
 	return lguest_data.time.tv_sec;
 }
 
+/* The TSC is a Time Stamp Counter.  The Host tells us what speed it runs at,
+ * or 0 if it's unusable as a reliable clock source.  This matches what we want
+ * here: if we return 0 from this function, the x86 TSC clock will not register
+ * itself. */
+static unsigned long lguest_cpu_khz(void)
+{
+	return lguest_data.tsc_khz;
+}
+
+/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where
+ * we read the time value given to us by the Host. */
 static cycle_t lguest_clock_read(void)
 {
 	unsigned long sec, nsec;
 
-	/* If the Host tells the TSC speed, we can trust that. */
-	if (lguest_data.tsc_khz)
-		return native_read_tsc();
-
-	/* If we can't use the TSC, we read the time value written by the Host.
-	 * Since it's in two parts (seconds and nanoseconds), we risk reading
-	 * it just as it's changing from 99 & 0.999999999 to 100 and 0, and
-	 * getting 99 and 0.  As Linux tends to come apart under the stress of
-	 * time travel, we must be careful: */
+	/* Since the time is in two parts (seconds and nanoseconds), we risk
+	 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
+	 * and getting 99 and 0.  As Linux tends to come apart under the stress
+	 * of time travel, we must be careful: */
 	do {
 		/* First we read the seconds part. */
 		sec = lguest_data.time.tv_sec;
@@ -622,14 +627,14 @@ static cycle_t lguest_clock_read(void)
 		/* Now if the seconds part has changed, try again. */
 	} while (unlikely(lguest_data.time.tv_sec != sec));
 
-	/* Our non-TSC clock is in real nanoseconds. */
+	/* Our lguest clock is in real nanoseconds. */
 	return sec*1000000000ULL + nsec;
 }
 
-/* This is what we tell the kernel is our clocksource.  */
+/* This is the fallback clocksource: lower priority than the TSC clocksource. */
 static struct clocksource lguest_clock = {
 	.name		= "lguest",
-	.rating		= 400,
+	.rating		= 200,
 	.read		= lguest_clock_read,
 	.mask		= CLOCKSOURCE_MASK(64),
 	.mult		= 1 << 22,
@@ -637,12 +642,6 @@ static struct clocksource lguest_clock = {
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-/* The "scheduler clock" is just our real clock, adjusted to start at zero */
-static unsigned long long lguest_sched_clock(void)
-{
-	return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);
-}
-
 /* We also need a "struct clock_event_device": Linux asks us to set it to go
  * off some time in the future.  Actually, James Morris figured all this out, I
  * just applied the patch. */
@@ -712,19 +711,8 @@ static void lguest_time_init(void)
 	/* Set up the timer interrupt (0) to go to our simple timer routine */
 	set_irq_handler(0, lguest_time_irq);
 
-	/* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can
-	 * use the TSC, otherwise it's a dumb nanosecond-resolution clock.
-	 * Either way, the "rating" is set so high that it's always chosen over
-	 * any other clocksource. */
-	if (lguest_data.tsc_khz)
-		lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
-							 lguest_clock.shift);
-	clock_base = lguest_clock_read();
 	clocksource_register(&lguest_clock);
 
-	/* Now we've set up our clock, we can use it as the scheduler clock */
-	pv_time_ops.sched_clock = lguest_sched_clock;
-
 	/* We can't set cpumask in the initializer: damn C limitations!  Set it
 	 * here and register our timer device. */
 	lguest_clockevent.cpumask = cpumask_of_cpu(0);
@@ -995,6 +983,7 @@ __init void lguest_init(void)
 	/* time operations */
 	pv_time_ops.get_wallclock = lguest_get_wallclock;
 	pv_time_ops.time_init = lguest_time_init;
+	pv_time_ops.get_cpu_khz = lguest_cpu_khz;
 
 	/* Now is a good time to look at the implementations of these functions
 	 * before returning to the rest of lguest_init(). */
-- 
cgit v1.2.2


From 4357bd9453b81e0a41db1dec16e06d74256b7560 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 11 Mar 2008 09:35:57 -0500
Subject: lguest: Revert 1ce70c4fac3c3954bd48c035f448793867592bc0, fix real
 problem.

Ahmed managed to crash the Host in release_pgd(), which cannot be a Guest
bug, and indeed it wasn't.

The bug was that handing a 0 as the address of the toplevel page table
being manipulated can cause the lookup code in find_pgdir() to return
an uninitialized cache entry (we shadow up to 4 top level page tables
for each Guest).

Commit 37cc8d7f963ba2deec29c9b68716944516a3244f introduced this
behaviour in the Guest, uncovering the bug.

The patch which he submitted (which removed the /4 from the index
calculation) simply ensured that these high-indexed entries hit the
early exit path of guest_set_pmd().  But you get lots of segfaults in
guest userspace as the PMDs aren't being updated.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/boot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9c27c104d83c..a104c532ff70 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -480,7 +480,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	*pmdp = pmdval;
 	lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
-		   (__pa(pmdp)&(PAGE_SIZE-1)), 0);
+		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
 }
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
-- 
cgit v1.2.2


From f5dbb55b995b77d396fe2204495a0af3e24d28c2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 10 Mar 2008 18:04:34 +0100
Subject: fix BIOS PCI config cycle buglet causing ACPI boot regression

I figured out another ACPI related regression today.

randconfig testing triggered an early boot-time hang on a laptop of mine
(32-bit x86, config attached) - the screen was scrolling ACPI AML
exceptions [with no serial port and no early debugging available].

v2.6.24 works fine on that laptop with the same .config, so after a few
hours of bisection (had to restart it 3 times - other regressions
interacted), it honed in on this commit:

| 10270d4838bdc493781f5a1cf2e90e9c34c9142f is first bad commit
|
| Author: Linus Torvalds <torvalds@woody.linux-foundation.org>
| Date:   Wed Feb 13 09:56:14 2008 -0800
|
|     acpi: fix acpi_os_read_pci_configuration() misuse of raw_pci_read()

reverting this commit ontop of -rc5 gave a correctly booting kernel.

But this commit fixes a real bug so the real question is, why did it
break the bootup?

After quite some head-scratching, the following change stood out:

-                               pci_id->bus = tu8;
+                               pci_id->bus = val;

pci_id->bus is defined as u16:

   struct acpi_pci_id {
           u16 segment;
           u16 bus;
   ...

and 'tu8' changed from u8 to u32. So previously we'd unconditionally
mask the return value of acpi_os_read_pci_configuration()
(raw_pci_read()) to 8 bits, but now we just trust whatever comes back
from the PCI access routines and only crop it to 16 bits.

But if the high 8 bits of that result contains any noise then we'll
write that into ACPI's PCI ID descriptor and confuse the heck out of the
rest of ACPI.

So lets check the PCI-BIOS code on that theory. We have this codepath
for 8-bit accesses (arch/x86/pci/pcbios.c:pci_bios_read()):

        switch (len) {
        case 1:
                __asm__("lcall *(%%esi); cld\n\t"
                        "jc 1f\n\t"
                        "xor %%ah, %%ah\n"
                        "1:"
                        : "=c" (*value),
                          "=a" (result)
                        : "1" (PCIBIOS_READ_CONFIG_BYTE),
                          "b" (bx),
                          "D" ((long)reg),
                          "S" (&pci_indirect));

Aha! The "=a" output constraint puts the full 32 bits of EAX into
*value. But if the BIOS's routines set any of the high bits to nonzero,
we'll return a value with more set in it than intended.

The other, more common PCI access methods (v1 and v2 PCI reads) clear
out the high bits already, for example pci_conf1_read() does:

        switch (len) {
        case 1:
                *value = inb(0xCFC + (reg & 3));

which explicitly converts the return byte up to 32 bits and zero-extends
it.

So zero-extending the result in the PCI-BIOS read routine fixes the
regression on my laptop. ( It might fix some other long-standing issues
we had with PCI-BIOS during the past decade ... ) Both 8-bit and 16-bit
accesses were buggy.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/pci/pcbios.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 10ac8c316c46..2f7109ac4c15 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -198,6 +198,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 			  "b" (bx),
 			  "D" ((long)reg),
 			  "S" (&pci_indirect));
+		/*
+		 * Zero-extend the result beyond 8 bits, do not trust the
+		 * BIOS having done it:
+		 */
+		*value &= 0xff;
 		break;
 	case 2:
 		__asm__("lcall *(%%esi); cld\n\t"
@@ -210,6 +215,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 			  "b" (bx),
 			  "D" ((long)reg),
 			  "S" (&pci_indirect));
+		/*
+		 * Zero-extend the result beyond 16 bits, do not trust the
+		 * BIOS having done it:
+		 */
+		*value &= 0xffff;
 		break;
 	case 4:
 		__asm__("lcall *(%%esi); cld\n\t"
-- 
cgit v1.2.2


From 9a46d7e5b63903a70cd96c2c1391a7a26a8dbec9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 Feb 2008 09:30:32 +0100
Subject: x86: ioremap, remove WARN_ON()

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/ioremap.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index ac3c959e271d..8fe576baa148 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -134,8 +134,6 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 			return NULL;
 	}
 
-	WARN_ON_ONCE(page_is_ram(pfn));
-
 	switch (mode) {
 	case IOR_MODE_UNCACHED:
 	default:
-- 
cgit v1.2.2


From 40f0933d51f4cba26a5c009a26bb230f4514c1b6 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Thu, 28 Feb 2008 19:57:07 -0800
Subject: x86: ia32 syscall restart fix

The code to restart syscalls after signals depends on checking for a
negative orig_ax, and for particular negative -ERESTART* values in ax.
These fields are 64 bits and for a 32-bit task they get zero-extended.
The syscall restart behavior is lost, a regression from a native 32-bit
kernel and from 64-bit tasks' behavior.

This patch fixes the problem by doing sign-extension where it matters.

For orig_ax, the only time the value should be -1 but winds up as
0x0ffffffff is via a 32-bit ptrace call. So the patch changes ptrace to
sign-extend the 32-bit orig_eax value when it's stored; it doesn't
change the checks on orig_ax, though it uses the new current_syscall()
inline to better document the subtle importance of the used of
signedness there.

The ax value is stored a lot of ways and it seems hard to get them all
sign-extended at their origins. So for that, we use the
current_syscall_ret() to sign-extend it only for 32-bit tasks at the
time of the -ERESTART* comparisons.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c    |  9 ++++++++-
 arch/x86/kernel/signal_64.c | 38 +++++++++++++++++++++++++++++++++-----
 2 files changed, 41 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8f64abe699fd..d5904eef1d31 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1055,10 +1055,17 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
 	R32(esi, si);
 	R32(ebp, bp);
 	R32(eax, ax);
-	R32(orig_eax, orig_ax);
 	R32(eip, ip);
 	R32(esp, sp);
 
+	case offsetof(struct user32, regs.orig_eax):
+		/*
+		 * Sign-extend the value so that orig_eax = -1
+		 * causes (long)orig_ax < 0 tests to fire correctly.
+		 */
+		regs->orig_ax = (long) (s32) value;
+		break;
+
 	case offsetof(struct user32, regs.eflags):
 		return set_flags(child, value);
 
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 56b72fb67f9b..1c83e5124c65 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -310,6 +310,35 @@ give_sigsegv:
 	return -EFAULT;
 }
 
+/*
+ * Return -1L or the syscall number that @regs is executing.
+ */
+static long current_syscall(struct pt_regs *regs)
+{
+	/*
+	 * We always sign-extend a -1 value being set here,
+	 * so this is always either -1L or a syscall number.
+	 */
+	return regs->orig_ax;
+}
+
+/*
+ * Return a value that is -EFOO if the system call in @regs->orig_ax
+ * returned an error.  This only works for @regs from @current.
+ */
+static long current_syscall_ret(struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32))
+		/*
+		 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
+		 * and will match correctly in comparisons.
+		 */
+		return (int) regs->ax;
+#endif
+	return regs->ax;
+}
+
 /*
  * OK, we're invoking a handler
  */	
@@ -327,9 +356,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 #endif
 
 	/* Are we from a system call? */
-	if ((long)regs->orig_ax >= 0) {
+	if (current_syscall(regs) >= 0) {
 		/* If so, check system call restarting.. */
-		switch (regs->ax) {
+		switch (current_syscall_ret(regs)) {
 		        case -ERESTART_RESTARTBLOCK:
 			case -ERESTARTNOHAND:
 				regs->ax = -EINTR;
@@ -426,10 +455,9 @@ static void do_signal(struct pt_regs *regs)
 	}
 
 	/* Did we come from a system call? */
-	if ((long)regs->orig_ax >= 0) {
+	if (current_syscall(regs) >= 0) {
 		/* Restart the system call - no handlers present */
-		long res = regs->ax;
-		switch (res) {
+		switch (current_syscall_ret(regs)) {
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
-- 
cgit v1.2.2


From 985a34bd75cc8c96e43f00dcdda7c3fdb51a3026 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 9 Mar 2008 13:14:37 +0100
Subject: x86: remove quicklists

quicklists cause a serious memory leak on 32-bit x86,
as documented at:

  http://bugzilla.kernel.org/show_bug.cgi?id=9991

the reason is that the quicklist pool is a special-purpose
cache that grows out of proportion. It is not accounted for
anywhere and users have no way to even realize that it's
the quicklists that are causing RAM usage spikes. It was
supposed to be a relatively small pool, but as demonstrated
by KOSAKI Motohiro, they can grow as large as:

  Quicklists:    1194304 kB

given how much trouble this code has caused historically,
and given that Andrew objected to its introduction on x86
(years ago), the best option at this point is to remove them.

[ any performance benefits of caching constructed pgds should
  be implemented in a more generic way (possibly within the page
  allocator), while still allowing constructed pages to be
  allocated by other workloads. ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig         |  3 ---
 arch/x86/mm/pgtable_32.c | 18 +++++++++---------
 2 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f41c9538ca30..237fc128143d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -66,9 +66,6 @@ config MMU
 config ZONE_DMA
 	def_bool y
 
-config QUICKLIST
-	def_bool X86_32
-
 config SBUS
 	bool
 
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 73aba7125203..2f9e9afcb9f4 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -342,12 +342,16 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
+	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 
-	mm->pgd = pgd;		/* so that alloc_pd can use it */
+	/* so that alloc_pd can use it */
+	mm->pgd = pgd;
+	if (pgd)
+		pgd_ctor(pgd);
 
 	if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
-		quicklist_free(0, pgd_dtor, pgd);
+		pgd_dtor(pgd);
+		free_page((unsigned long)pgd);
 		pgd = NULL;
 	}
 
@@ -357,12 +361,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	pgd_mop_up_pmds(mm, pgd);
-	quicklist_free(0, pgd_dtor, pgd);
-}
-
-void check_pgt_cache(void)
-{
-	quicklist_trim(0, pgd_dtor, 25, 16);
+	pgd_dtor(pgd);
+	free_page((unsigned long)pgd);
 }
 
 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-- 
cgit v1.2.2