From 6c132b5fe6579ed4b4892c02fe6c05f7e3afc579 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Tue, 27 Sep 2005 21:45:36 -0700
Subject: [PATCH] x86-64: Fix bad assumption that dualcore cpus have synced
 TSCs

This should resolve the issue seen in bugme bug #5105, where it is assumed
that dualcore x86_64 systems have synced TSCs.  This is not the case, and
alternate timesources should be used instead.

For more details, see:
http://bugzilla.kernel.org/show_bug.cgi?id=5105

Andi's earlier concerns that the TSCs should be synced on dualcore systems
have been resolved by confirmation from AMD folks that they can be
unsynced.

Acked-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/time.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 2373cb8b8625..703acde2a1a5 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -959,9 +959,6 @@ static __init int unsynchronized_tsc(void)
  	   are handled in the OEM check above. */
  	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
  		return 0;
- 	/* All in a single socket - should be synchronized */
- 	if (cpus_weight(cpu_core_map[0]) == num_online_cpus())
- 		return 0;
 #endif
  	/* Assume multi socket systems are not synchronized */
  	return num_online_cpus() > 1;
-- 
cgit v1.2.2


From 6c654b5fdf093cd05f35f7c9c2a00182fa5636dc Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Thu, 29 Sep 2005 14:42:42 -0700
Subject: [PATCH] swiotlb: move from arch/ia64/lib/ to lib/

The swiotlb implementation is shared by both IA-64 and EM64T. However,
the source itself lives under arch/ia64. This patch moves swiotlb.c
from arch/ia64/lib to lib/ and fixes-up the appropriate Makefile and
Kconfig files. No actual changes are made to swiotlb.c.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86_64/kernel/Makefile | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index bcdd0a805fe7..14328cab5d3a 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -27,7 +27,6 @@ obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o
 obj-$(CONFIG_DUMMY_IOMMU)	+= pci-nommu.o pci-dma.o
-obj-$(CONFIG_SWIOTLB)		+= swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o
 
@@ -41,7 +40,6 @@ CFLAGS_vsyscall.o		:= $(PROFILING) -g0
 bootflag-y			+= ../../i386/kernel/bootflag.o
 cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
 topology-y                     += ../../i386/mach-default/topology.o
-swiotlb-$(CONFIG_SWIOTLB)      += ../../ia64/lib/swiotlb.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../i386/kernel/microcode.o
 intel_cacheinfo-y		+= ../../i386/kernel/cpu/intel_cacheinfo.o
 quirks-y			+= ../../i386/kernel/quirks.o
-- 
cgit v1.2.2


From 7d318d774789657c37a5e994a4a2cf59d4879ae7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 29 Sep 2005 22:05:55 +0200
Subject: [PATCH] Fix up TLB flush filter disabling

I checked with AMD and they requested to only disable it for family 15.
Also disable it for i386 too. And some style fixes.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 238f73e1a834..257f5ba17902 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -831,8 +831,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 #endif
 }
 
-#define HWCR 0xc0010015
-
 static int __init init_amd(struct cpuinfo_x86 *c)
 {
 	int r;
@@ -841,14 +839,18 @@ static int __init init_amd(struct cpuinfo_x86 *c)
 #ifdef CONFIG_SMP
 	unsigned long value;
 
-	// Disable TLB flush filter by setting HWCR.FFDIS:
-	// bit 6 of msr C001_0015
-	//
-	// Errata 63 for SH-B3 steppings
-	// Errata 122 for all(?) steppings
-	rdmsrl(HWCR, value);
-	value |= 1 << 6;
-	wrmsrl(HWCR, value);
+	/*
+	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
+	 * bit 6 of msr C001_0015
+ 	 *
+	 * Errata 63 for SH-B3 steppings
+	 * Errata 122 for all steppings (F+ have it disabled by default)
+	 */
+	if (c->x86 == 15) {
+		rdmsrl(MSR_K8_HWCR, value);
+		value |= 1 << 6;
+		wrmsrl(MSR_K8_HWCR, value);
+	}
 #endif
 
 	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
-- 
cgit v1.2.2


From 7644143cd6f7e029f3a8ea64f5fb0ab33ec39f72 Mon Sep 17 00:00:00 2001
From: Mike Waychison <mikew@google.com>
Date: Fri, 30 Sep 2005 00:01:27 +0200
Subject: [PATCH] x86_64: Fix mce_log

The attempt to fixup the lockless mce log buffer introduced an infinite loop
when trying to find a free entry.

And:

Using rcu_dereference() to load mcelog.next doesn't seem to be sufficient
enough to ensure that mcelog.next is loaded each time around the loop in
mce_log().  Instead, use an explicit rmb() to ensure that the compiler gets it
right.

AK: turned the smp_wmbs into true wmbs to make sure they are not
reordered by the compiler on UP.

Signed-off-by: Mike Waychison <mikew@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/mce.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 08203b07f4bd..69541db5ff2c 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -54,9 +54,12 @@ void mce_log(struct mce *mce)
 {
 	unsigned next, entry;
 	mce->finished = 0;
-	smp_wmb();
+	wmb();
 	for (;;) {
 		entry = rcu_dereference(mcelog.next);
+		/* The rmb forces the compiler to reload next in each
+		    iteration */
+		rmb();
 		for (;;) {
 			/* When the buffer fills up discard new entries. Assume
 			   that the earlier errors are the more interesting. */
@@ -69,6 +72,7 @@ void mce_log(struct mce *mce)
 				entry++;
 				continue;
 			}
+			break;
 		}
 		smp_rmb();
 		next = entry + 1;
@@ -76,9 +80,9 @@ void mce_log(struct mce *mce)
 			break;
 	}
 	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
-	smp_wmb();
+	wmb();
 	mcelog.entry[entry].finished = 1;
-	smp_wmb();
+	wmb();
 
 	if (!test_and_set_bit(0, &console_logged))
 		notify_user = 1;
-- 
cgit v1.2.2


From 2dd960d66bc12b6b206e63104636514e5da0ddb7 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin.zhang@intel.com>
Date: Fri, 30 Sep 2005 11:59:20 -0700
Subject: [PATCH] utilization of kprobe_mutex is incorrect on x86_64

The up()/down() orders are incorrect in arch/x86_64/kprobes.c file.
kprobe_mutext is used to protect the free kprobe instruction slot list.
arch_prepare_kprobe applies for a slot from the free list, and
arch_remove_kprobe returns a slot to the free list.  The incorrect up()/down()
orders to operate on kprobe_mutex fail to protect the free list.  If 2 threads
try to get/return kprobe instruction slot at the same time, the free slot list
might be broken, or a free slot might be applied by 2 threads.

Signed-off-by: Zhang Yanmin <Yanmin.zhang@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/kprobes.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index df08c43276a0..76a28b007be9 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -77,9 +77,9 @@ static inline int is_IF_modifier(kprobe_opcode_t *insn)
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
 	/* insn: must be on special executable page on x86_64. */
-	up(&kprobe_mutex);
-	p->ainsn.insn = get_insn_slot();
 	down(&kprobe_mutex);
+	p->ainsn.insn = get_insn_slot();
+	up(&kprobe_mutex);
 	if (!p->ainsn.insn) {
 		return -ENOMEM;
 	}
@@ -231,9 +231,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	up(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
 	down(&kprobe_mutex);
+	free_insn_slot(p->ainsn.insn);
+	up(&kprobe_mutex);
 }
 
 static inline void save_previous_kprobe(void)
-- 
cgit v1.2.2


From e6a045a5b89037ae87c8c1bc84403f1d498e52a1 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Fri, 30 Sep 2005 11:59:21 -0700
Subject: [PATCH] x86_64: fix the BP node_to_cpumask

Fix the BP node_to_cpumask.  2.6.14-rc* broke the boot cpu bit as the
cpu_to_node(0) is now not setup early enough for numa_init_array.
cpu_to_node[] is setup much later at srat_detect_node on acpi srat based
em64t machines.  This seems like a problem on amd machines too, Tested on
em64t though.  /sys/devices/system/node/node0/cpumap shows up sanely after
this patch.

Signed off by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/numa.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 80a49d9bd8a7..68ad75853510 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -178,7 +178,6 @@ void __init numa_init_array(void)
 		rr++; 
 	}
 
-	set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
 }
 
 #ifdef CONFIG_NUMA_EMU
@@ -266,9 +265,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 
 __cpuinit void numa_add_cpu(int cpu)
 {
-	/* BP is initialized elsewhere */
-	if (cpu) 
-		set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 } 
 
 unsigned long __init numa_free_all_bootmem(void) 
-- 
cgit v1.2.2


From 85cc5135ace4c8b75d7b4e1ea9fe15a7fcbd1516 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Fri, 30 Sep 2005 11:59:22 -0700
Subject: [PATCH] x86_64 early numa init fix

The tests Alok carried out on Petr's box confirmed that cpu_to_node[BP] is
not setup early enough by numa_init_array due to the x86_64 changes in
2.6.14-rc*, and unfortunately set wrongly by the work around code in
numa_init_array().  cpu_to_node[0] gets set with 1 early and later gets set
properly to 0 during identify_cpu() when all cpus are brought up, but
confusing the numa slab in the process.

Here is a quick fix for this.  The right fix obviously is to have
cpu_to_node[bsp] setup early for numa_init_array().  The following patch
will fix the problem now, and the code can stay on even when
cpu_to_node{BP] gets fixed early correctly.

Thanks to Petr for access to his box.

Signed off by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/numa.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 68ad75853510..214803821001 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -167,15 +167,14 @@ void __init numa_init_array(void)
 	   mapping. To avoid this fill in the mapping for all possible
 	   CPUs, as the number of CPUs is not known yet. 
 	   We round robin the existing nodes. */
-	rr = 0;
+	rr = first_node(node_online_map);
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
+		cpu_to_node[i] = rr;
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
-		cpu_to_node[i] = rr;
-		rr++; 
 	}
 
 }
-- 
cgit v1.2.2


From ddea7be0ec8d1374f0b483a81566ed56ec9f3905 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Mon, 3 Oct 2005 10:36:28 -0700
Subject: [PATCH] x86_64: Fix numa node topology detection for srat based
 x86_64 boxes

2.6.14-rc2 does not assign cpus to proper nodeids on our em64t numa boxen.
Our boxes use acpi srat for parsing the numa information.

srat_detect_node() used phys_proc_id[] to get to the cpu's local apic id,
but phys_proc_id[] represents the cpu<->initial_apic_id mapping.  The
following patch fixes this problem.  Now apicid_to_node[] is properly
indexed with the local apic id.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 257f5ba17902..cb28df14ff6f 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -967,13 +967,12 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
 static void srat_detect_node(void)
 {
 #ifdef CONFIG_NUMA
-	unsigned apicid, node;
+	unsigned node;
 	int cpu = smp_processor_id();
 
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
-	apicid = phys_proc_id[cpu];
-	node = apicid_to_node[apicid];
+	node = apicid_to_node[hard_smp_processor_id()];
 	if (node == NUMA_NO_NODE)
 		node = 0;
 	cpu_to_node[cpu] = node;
-- 
cgit v1.2.2


From 944d2647dded12e2b05ad8ebc020644bb1997ce1 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 5 Oct 2005 00:21:39 +0200
Subject: [PATCH] x86_64: Drop global bit from early low mappings

Drop global bit from early low mappings

Suggested by Linus, originally also proposed by Suresh.

This fixes a race condition with early start of udev, originally
tracked down by Suresh B. Siddha. The problem was that switching
to the user space VM would not clear the global low mappings
for the beginning of memory, which lead to memory corruption.

Drop the global bits.

The kernel mapping stays global because it should stay constant.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/head.S | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 4592bf21fcaf..b92e5f45ed46 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -270,26 +270,26 @@ ENTRY(level3_kernel_pgt)
 .org 0x4000
 ENTRY(level2_ident_pgt)
 	/* 40MB for bootup. 	*/
-	.quad	0x0000000000000183
-	.quad	0x0000000000200183
-	.quad	0x0000000000400183
-	.quad	0x0000000000600183
-	.quad	0x0000000000800183
-	.quad	0x0000000000A00183
-	.quad	0x0000000000C00183
-	.quad	0x0000000000E00183
-	.quad	0x0000000001000183
-	.quad	0x0000000001200183
-	.quad	0x0000000001400183
-	.quad	0x0000000001600183
-	.quad	0x0000000001800183
-	.quad	0x0000000001A00183
-	.quad	0x0000000001C00183
-	.quad	0x0000000001E00183
-	.quad	0x0000000002000183
-	.quad	0x0000000002200183
-	.quad	0x0000000002400183
-	.quad	0x0000000002600183
+	.quad	0x0000000000000083
+	.quad	0x0000000000200083
+	.quad	0x0000000000400083
+	.quad	0x0000000000600083
+	.quad	0x0000000000800083
+	.quad	0x0000000000A00083
+	.quad	0x0000000000C00083
+	.quad	0x0000000000E00083
+	.quad	0x0000000001000083
+	.quad	0x0000000001200083
+	.quad	0x0000000001400083
+	.quad	0x0000000001600083
+	.quad	0x0000000001800083
+	.quad	0x0000000001A00083
+	.quad	0x0000000001C00083
+	.quad	0x0000000001E00083
+	.quad	0x0000000002000083
+	.quad	0x0000000002200083
+	.quad	0x0000000002400083
+	.quad	0x0000000002600083
 	/* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
 	.globl temp_boot_pmds
 temp_boot_pmds:
-- 
cgit v1.2.2


From 3dd083255ddcfa87751fa8e32f61a9547a15a541 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 9 Oct 2005 21:19:40 +0200
Subject: [PATCH] x86_64: Set up safe page tables during resume

The following patch makes swsusp avoid the possible temporary corruption
of page translation tables during resume on x86-64.  This is achieved by
creating a copy of the relevant page tables that will not be modified by
swsusp and can be safely used by it on resume.

The problem is that during resume on x86-64 swsusp may temporarily
corrupt the page tables used for the direct mapping of RAM.  If that
happens, a page fault occurs and cannot be handled properly, which leads
to the solid hang of the affected system.  This leads to the loss of the
system's state from before suspend and may result in the loss of data or
the corruption of filesystems, so it is a serious issue.  Also, it
appears to happen quite often (for me, as often as 50% of the time).

The problem is related to the fact that (at least) one of the PMD
entries used in the direct memory mapping (starting at PAGE_OFFSET)
points to a page table the physical address of which is much greater
than the physical address of the PMD entry itself.  Moreover,
unfortunately, the physical address of the page table before suspend
(i.e.  the one stored in the suspend image) happens to be different to
the physical address of the corresponding page table used during resume
(i.e.  the one that is valid right before swsusp_arch_resume() in
arch/x86_64/kernel/suspend_asm.S is executed).  Thus while the image is
restored, the "offending" PMD entry gets overwritten, so it does not
point to the right physical address any more (i.e.  there's no page
table at the address pointed to by it, because it points to the address
the page table has been at during suspend).  Consequently, if the PMD
entry is used later on, and it _is_ used in the process of copying the
image pages, a page fault occurs, but it cannot be handled in the normal
way and the system hangs.

In principle we can call create_resume_mapping() from
swsusp_arch_resume() (ie.  from suspend_asm.S), but then the memory
allocations in create_resume_mapping(), resume_pud_mapping(), and
resume_pmd_mapping() must be made carefully so that we use _only_
NosaveFree pages in them (the other pages are overwritten by the loop in
swsusp_arch_resume()).  Additionally, we are in atomic context at that
time, so we cannot use GFP_KERNEL.  Moreover, if one of the allocations
fails, we should free all of the allocated pages, so we need to trace
them somehow.

All of this is done in the appended patch, except that the functions
populating the page tables are located in arch/x86_64/kernel/suspend.c
rather than in init.c.  It may be done in a more elegan way in the
future, with the help of some swsusp patches that are in the works now.

[AK: move some externs into headers, renamed a function]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/suspend.c     | 127 +++++++++++++++++++++++++++++++++++++++
 arch/x86_64/kernel/suspend_asm.S |  17 ++++--
 2 files changed, 138 insertions(+), 6 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebb9abf3ce6d..f066c6ab3618 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -11,6 +11,8 @@
 #include <linux/smp.h>
 #include <linux/suspend.h>
 #include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
 
 struct saved_context saved_context;
 
@@ -140,4 +142,129 @@ void fix_processor_context(void)
 
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/* Defined in arch/x86_64/kernel/suspend_asm.S */
+extern int restore_image(void);
 
+pgd_t *temp_level4_pgt;
+
+static void **pages;
+
+static inline void *__add_page(void)
+{
+	void **c;
+
+	c = (void **)get_usable_page(GFP_ATOMIC);
+	if (c) {
+		*c = pages;
+		pages = c;
+	}
+	return c;
+}
+
+static inline void *__next_page(void)
+{
+	void **c;
+
+	c = pages;
+	if (c) {
+		pages = *c;
+		*c = NULL;
+	}
+	return c;
+}
+
+/*
+ * Try to allocate as many usable pages as needed and daisy chain them.
+ * If one allocation fails, free the pages allocated so far
+ */
+static int alloc_usable_pages(unsigned long n)
+{
+	void *p;
+
+	pages = NULL;
+	do
+		if (!__add_page())
+			break;
+	while (--n);
+	if (n) {
+		p = __next_page();
+		while (p) {
+			free_page((unsigned long)p);
+			p = __next_page();
+		}
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+	long i, j;
+
+	i = pud_index(address);
+	pud = pud + i;
+	for (; i < PTRS_PER_PUD; pud++, i++) {
+		unsigned long paddr;
+		pmd_t *pmd;
+
+		paddr = address + i*PUD_SIZE;
+		if (paddr >= end)
+			break;
+
+		pmd = (pmd_t *)__next_page();
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+			unsigned long pe;
+
+			if (paddr >= end)
+				break;
+			pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
+			pe &= __supported_pte_mask;
+			set_pmd(pmd, __pmd(pe));
+		}
+	}
+}
+
+static void set_up_temporary_mappings(void)
+{
+	unsigned long start, end, next;
+
+	temp_level4_pgt = (pgd_t *)__next_page();
+
+	/* It is safe to reuse the original kernel mapping */
+	set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
+		init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+	/* Set up the direct mapping from scratch */
+	start = (unsigned long)pfn_to_kaddr(0);
+	end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+	for (; start < end; start = next) {
+		pud_t *pud = (pud_t *)__next_page();
+		next = start + PGDIR_SIZE;
+		if (next > end)
+			next = end;
+		res_phys_pud_init(pud, __pa(start), __pa(next));
+		set_pgd(temp_level4_pgt + pgd_index(start),
+			mk_kernel_pgd(__pa(pud)));
+	}
+}
+
+int swsusp_arch_resume(void)
+{
+	unsigned long n;
+
+	n = ((end_pfn << PAGE_SHIFT) + PUD_SIZE - 1) >> PUD_SHIFT;
+	n += (n + PTRS_PER_PUD - 1) / PTRS_PER_PUD + 1;
+	pr_debug("swsusp_arch_resume(): pages needed = %lu\n", n);
+	if (alloc_usable_pages(n)) {
+		free_eaten_memory();
+		return -ENOMEM;
+	}
+	/* We have got enough memory and from now on we cannot recover */
+	set_up_temporary_mappings();
+	restore_image();
+	return 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 4d659e97df10..320b6fb00cca 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -39,12 +39,13 @@ ENTRY(swsusp_arch_suspend)
 	call swsusp_save
 	ret
 
-ENTRY(swsusp_arch_resume)
-	/* set up cr3 */	
-	leaq	init_level4_pgt(%rip),%rax
-	subq	$__START_KERNEL_map,%rax
-	movq	%rax,%cr3
-
+ENTRY(restore_image)
+	/* switch to temporary page tables */
+	movq	$__PAGE_OFFSET, %rdx
+	movq	temp_level4_pgt(%rip), %rax
+	subq	%rdx, %rax
+	movq	%rax, %cr3
+	/* Flush TLB */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
 	andq	$~(1<<7), %rdx	# PGE
@@ -69,6 +70,10 @@ loop:
 	movq	pbe_next(%rdx), %rdx
 	jmp	loop
 done:
+	/* go back to the original page tables */
+	leaq	init_level4_pgt(%rip), %rax
+	subq	$__START_KERNEL_map, %rax
+	movq	%rax, %cr3
 	/* Flush TLB, including "global" things (vmalloc) */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
-- 
cgit v1.2.2


From d347f372273c2b3d86a66e2e1c94c790c208e166 Mon Sep 17 00:00:00 2001
From: "Markus F.X.J. Oberhumer" <markus@oberhumer.com>
Date: Sun, 9 Oct 2005 18:54:23 +0200
Subject: [PATCH] i386: fix stack alignment for signal handlers

This fixes the setup of the alignment of the signal frame, so that all
signal handlers are run with a properly aligned stack frame.

The current code "over-aligns" the stack pointer so that the stack frame
is effectively always mis-aligned by 4 bytes.  But what we really want
is that on function entry ((sp + 4) & 15) == 0, which matches what would
happen if the stack were aligned before a "call" instruction.

Signed-off-by: Markus F.X.J. Oberhumer <markus@oberhumer.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/ia32/ia32_signal.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 66e2821533db..0903cc1faef2 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -425,7 +425,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
 		rsp = (unsigned long) ka->sa.sa_restorer;
 	}
 
-	return (void __user *)((rsp - frame_size) & -8UL);
+	rsp -= frame_size;
+	/* Align the stack pointer according to the i386 ABI,
+	 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
+	rsp = ((rsp + 4) & -16ul) - 4;
+	return (void __user *) rsp;
 }
 
 int ia32_setup_frame(int sig, struct k_sigaction *ka,
-- 
cgit v1.2.2


From 094804c5a132f04c12dd4902ee15c64362e5c1af Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 11 Oct 2005 01:03:39 +0200
Subject: [PATCH] x86_64: Fix change_page_attr cache flushing

Noticed by Terence Ripperda

Undo wrong change in global_flush_tlb. We need to flush the caches in all
cases, not just when pages were reverted. This was a bogus optimization
added earlier, but it was wrong.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/pageattr.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 94862e1ec032..b90e8fe9eeb0 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -220,8 +220,6 @@ void global_flush_tlb(void)
 	down_read(&init_mm.mmap_sem);
 	df = xchg(&df_list, NULL);
 	up_read(&init_mm.mmap_sem);
-	if (!df)
-		return;
 	flush_map((df && !df->next) ? df->address : 0);
 	for (; df; df = next_df) { 
 		next_df = df->next;
-- 
cgit v1.2.2


From 421c7ce6d001fce28b1fa8fdd2e7ded0ed8a0ad5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 10 Oct 2005 22:32:45 +0200
Subject: [PATCH] x86_64: Allocate cpu local data for all possible CPUs

CPU hotplug fills up the possible map to NR_CPUs, but it did that after
setting up per CPU data. This lead to CPU data not getting allocated
for all possible CPUs, which lead to various side effects.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup64.c | 4 ++++
 arch/x86_64/kernel/smpboot.c | 6 +-----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index bd33be24a386..79190891fbc5 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -87,6 +87,10 @@ void __init setup_per_cpu_areas(void)
 	int i;
 	unsigned long size;
 
+#ifdef CONFIG_HOTPLUG_CPU
+	prefill_possible_map();
+#endif
+
 	/* Copy section for each CPU (we discard the original) */
 	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
 #ifdef CONFIG_MODULES
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index e12d7baeb33e..658a81b33f3b 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -892,7 +892,7 @@ static __init void disable_smp(void)
  * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
  * - Ashok Raj
  */
-static void prefill_possible_map(void)
+__init void prefill_possible_map(void)
 {
 	int i;
 	for (i = 0; i < NR_CPUS; i++)
@@ -967,10 +967,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	current_cpu_data = boot_cpu_data;
 	current_thread_info()->cpu = 0;  /* needed? */
 
-#ifdef CONFIG_HOTPLUG_CPU
-	prefill_possible_map();
-#endif
-
 	if (smp_sanity_check(max_cpus) < 0) {
 		printk(KERN_INFO "SMP disabled\n");
 		disable_smp();
-- 
cgit v1.2.2


From f80aabb03a33702d934fbc3c02fd96471816d82e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 21 Oct 2005 03:21:43 -0400
Subject: [PATCH] gfp_t: dma-mapping (amd64)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/pci-gart.c  | 4 ++--
 arch/x86_64/kernel/pci-nommu.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index cf0a0315d586..88be97c96987 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -187,7 +187,7 @@ static void flush_gart(struct device *dev)
 
 /* Allocate DMA memory on node near device */
 noinline
-static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
+static void *dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
 {
 	struct page *page;
 	int node;
@@ -204,7 +204,7 @@ static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
  */
 void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   unsigned gfp)
+		   gfp_t gfp)
 {
 	void *memory;
 	unsigned long dma_mask = 0;
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 67d90b89af0b..5a981dca87ff 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -24,7 +24,7 @@ EXPORT_SYMBOL(iommu_sac_force);
  */
 
 void *dma_alloc_coherent(struct device *hwdev, size_t size,
-			 dma_addr_t *dma_handle, unsigned gfp)
+			 dma_addr_t *dma_handle, gfp_t gfp)
 {
 	void *ret;
 	u64 mask;
-- 
cgit v1.2.2


From 404351e67a9facb475abf1492245374a28d13e90 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:04 -0700
Subject: [PATCH] mm: mm_init set_mm_counters

How is anon_rss initialized?  In dup_mmap, and by mm_alloc's memset; but
that's not so good if an mm_counter_t is a special type.  And how is rss
initialized?  By set_mm_counter, all over the place.  Come on, we just need to
initialize them both at once by set_mm_counter in mm_init (which follows the
memcpy when forking).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/ia32/ia32_aout.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 3e6780fa0186..93c60f4aa47a 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
-- 
cgit v1.2.2


From 872fec16d9a0ed3b75b8893aa217e49cca575ee5 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:21 -0700
Subject: [PATCH] mm: init_mm without ptlock

First step in pushing down the page_table_lock.  init_mm.page_table_lock has
been used throughout the architectures (usually for ioremap): not to serialize
kernel address space allocation (that's usually vmlist_lock), but because
pud_alloc,pmd_alloc,pte_alloc_kernel expect caller holds it.

Reverse that: don't lock or unlock init_mm.page_table_lock in any of the
architectures; instead rely on pud_alloc,pmd_alloc,pte_alloc_kernel to take
and drop it when allocating a new one, to check lest a racing task already
did.  Similarly no page_table_lock in vmalloc's map_vm_area.

Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle
user mms, which are converted only by a later patch, for now they have to lock
differently according to whether or not it's init_mm.

If sources get muddled, there's a danger that an arch source taking
init_mm.page_table_lock will be mixed with common source also taking it (or
neither take it).  So break the rules and make another change, which should
break the build for such a mismatch: remove the redundant mm arg from
pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13).

Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64
used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to
pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64
map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free
took page_table_lock for no good reason.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/ioremap.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index 6972df480d2b..ecf7acb5db9b 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -105,7 +105,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pud = pud_alloc(&init_mm, pgd, address);
@@ -119,7 +118,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgd++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
-- 
cgit v1.2.2


From dacb16b1a034fa7a0b868ee30758119fbfd90bc1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Sun, 30 Oct 2005 14:59:25 -0800
Subject: [PATCH] i386 and x86_64 TSC set_cyc2ns_scale imprecision

I just found out that some precision is unnecessarily lost in the
arch/i386/kernel/timers/timer_tsc.c:set_cyc2ns_scale function.  It uses a
cpu_mhz parameter when it could use a cpu_khz.  In the specific case of an
Intel P4 running at 3001.171 Mhz, the truncation to 3001 Mhz leads to an
imprecision of 19 microseconds per second : this is very sad for a timer with
nearly nanosecond accuracy.

Fix the x86_64 architecture too.

Cc: george anzinger <george@mvista.com>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/time.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 703acde2a1a5..47d25ad08160 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -481,9 +481,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 static unsigned int cyc2ns_scale;
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
-static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 {
-	cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
+	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
 }
 
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
@@ -655,7 +655,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 			vxtime.tsc_quot = (1000L << 32) / cpu_khz;
 	}
 	
-	set_cyc2ns_scale(cpu_khz_ref / 1000);
+	set_cyc2ns_scale(cpu_khz_ref);
 
 	return 0;
 }
@@ -939,7 +939,7 @@ void __init time_init(void)
 	rdtscll_sync(&vxtime.last_tsc);
 	setup_irq(0, &irq0);
 
-	set_cyc2ns_scale(cpu_khz / 1000);
+	set_cyc2ns_scale(cpu_khz);
 
 #ifndef CONFIG_SMP
 	time_init_gtod();
-- 
cgit v1.2.2


From 08967f941ad897b2f7c2f99e886c75d6319e5087 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Sun, 30 Oct 2005 14:59:28 -0800
Subject: [PATCH] FPU context corrupted after resume

mxcsr_feature_mask_init isn't needed in suspend/resume time (we can use
boot time mask).  And actually it's harmful, as it clear task's saved
fxsave in resume.  This bug is widely seen by users using zsh.

(akpm: my eyes.  Fixed some surrounding whitespace mess)

Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/suspend.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index f066c6ab3618..02516823f514 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -63,13 +63,12 @@ void save_processor_state(void)
 	__save_processor_state(&saved_context);
 }
 
-static void
-do_fpu_end(void)
+static void do_fpu_end(void)
 {
-        /* restore FPU regs if necessary */
-	/* Do it out of line so that gcc does not move cr0 load to some stupid place */
-        kernel_fpu_end();
-	mxcsr_feature_mask_init();
+	/*
+	 * Restore FPU regs if necessary
+	 */
+	kernel_fpu_end();
 }
 
 void __restore_processor_state(struct saved_context *ctxt)
-- 
cgit v1.2.2


From daedb82d6b54e58a66ad1dce3509e699a5bd1b18 Mon Sep 17 00:00:00 2001
From: "Kamble, Nitin A" <nitin.a.kamble@intel.com>
Date: Sun, 30 Oct 2005 14:59:43 -0800
Subject: [PATCH] x86: vmx cpu feature detection

If VMX feature is available in the CPU, this patch will make it visible in
the /proc/cpuinfo with the cpuid detection.

Signed-Off-By: Nitin A Kamble <nitin.a.kamble@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index cb28df14ff6f..da0bc3e7bdf5 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -1213,7 +1213,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* Intel-defined (#2) */
-		"pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est",
+		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est",
 		"tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-- 
cgit v1.2.2


From c53117815771e1e84e6ba80a42fa1f8e330adb4d Mon Sep 17 00:00:00 2001
From: Brian Gerst <bgerst@didntduck.org>
Date: Sun, 30 Oct 2005 14:59:44 -0800
Subject: [PATCH] Clean up mtrr compat ioctl code

Handle 32-bit mtrr ioctls in the mtrr driver instead of the ia32
compatability layer.

Signed-off-by: Brian Gerst <bgerst@didntduck.org>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/ia32/ia32_ioctl.c | 96 -------------------------------------------
 1 file changed, 96 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
index 419758f19ca4..0ad5cc33b45a 100644
--- a/arch/x86_64/ia32/ia32_ioctl.c
+++ b/arch/x86_64/ia32/ia32_ioctl.c
@@ -12,7 +12,6 @@
 #define INCLUDES
 #include <linux/syscalls.h>
 #include "compat_ioctl.c"
-#include <asm/mtrr.h>
 #include <asm/ia32.h>
 
 #define CODE
@@ -85,90 +84,6 @@ static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
 	return sys_ioctl(fd,cmd,arg); 
 } 
 
-/* /proc/mtrr ioctls */
-
-
-struct mtrr_sentry32
-{
-    compat_ulong_t base;    /*  Base address     */
-    compat_uint_t size;    /*  Size of region   */
-    compat_uint_t type;     /*  Type of region   */
-};
-
-struct mtrr_gentry32
-{
-    compat_ulong_t regnum;   /*  Register number  */
-    compat_uint_t base;    /*  Base address     */
-    compat_uint_t size;    /*  Size of region   */
-    compat_uint_t type;     /*  Type of region   */
-};
-
-#define	MTRR_IOCTL_BASE	'M'
-
-#define MTRRIOC32_ADD_ENTRY        _IOW(MTRR_IOCTL_BASE,  0, struct mtrr_sentry32)
-#define MTRRIOC32_SET_ENTRY        _IOW(MTRR_IOCTL_BASE,  1, struct mtrr_sentry32)
-#define MTRRIOC32_DEL_ENTRY        _IOW(MTRR_IOCTL_BASE,  2, struct mtrr_sentry32)
-#define MTRRIOC32_GET_ENTRY        _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
-#define MTRRIOC32_KILL_ENTRY       _IOW(MTRR_IOCTL_BASE,  4, struct mtrr_sentry32)
-#define MTRRIOC32_ADD_PAGE_ENTRY   _IOW(MTRR_IOCTL_BASE,  5, struct mtrr_sentry32)
-#define MTRRIOC32_SET_PAGE_ENTRY   _IOW(MTRR_IOCTL_BASE,  6, struct mtrr_sentry32)
-#define MTRRIOC32_DEL_PAGE_ENTRY   _IOW(MTRR_IOCTL_BASE,  7, struct mtrr_sentry32)
-#define MTRRIOC32_GET_PAGE_ENTRY   _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
-#define MTRRIOC32_KILL_PAGE_ENTRY  _IOW(MTRR_IOCTL_BASE,  9, struct mtrr_sentry32)
-
-
-static int mtrr_ioctl32(unsigned int fd, unsigned int cmd, unsigned long arg)
-{ 
-	struct mtrr_gentry g;
-	struct mtrr_sentry s;
-	int get = 0, err = 0; 
-	struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)arg; 
-	mm_segment_t oldfs = get_fs(); 
-
-	switch (cmd) { 
-#define SET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; break 
-#define GET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; get=1; break
-		SET(ADD);
-		SET(SET); 
-		SET(DEL);
-		GET(GET); 
-		SET(KILL);
-		SET(ADD_PAGE); 
-		SET(SET_PAGE); 
-		SET(DEL_PAGE); 
-		GET(GET_PAGE); 
-		SET(KILL_PAGE); 
-	} 
-	
-	if (get) { 
-		err = get_user(g.regnum, &g32->regnum);
-		err |= get_user(g.base, &g32->base);
-		err |= get_user(g.size, &g32->size);
-		err |= get_user(g.type, &g32->type); 
-
-		arg = (unsigned long)&g; 
-	} else { 
-		struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)arg;
-		err = get_user(s.base, &s32->base);
-		err |= get_user(s.size, &s32->size);
-		err |= get_user(s.type, &s32->type);
-
-		arg = (unsigned long)&s; 
-	} 
-	if (err) return err;
-	
-	set_fs(KERNEL_DS); 
-	err = sys_ioctl(fd, cmd, arg); 
-	set_fs(oldfs); 
-		
-	if (!err && get) { 
-		err = put_user(g.base, &g32->base);
-		err |= put_user(g.size, &g32->size);
-		err |= put_user(g.regnum, &g32->regnum);
-		err |= put_user(g.type, &g32->type); 
-	} 
-	return err;
-} 
 
 #define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) }, 
 #define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
@@ -193,17 +108,6 @@ HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl)
 /* take care of sizeof(sizeof()) breakage */
-/* mtrr */
-HANDLE_IOCTL(MTRRIOC32_ADD_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_SET_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_DEL_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_GET_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_KILL_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_ADD_PAGE_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_SET_PAGE_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_DEL_PAGE_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_GET_PAGE_ENTRY, mtrr_ioctl32)
-HANDLE_IOCTL(MTRRIOC32_KILL_PAGE_ENTRY, mtrr_ioctl32)
 }; 
 
 int ioctl_table_size = ARRAY_SIZE(ioctl_start);
-- 
cgit v1.2.2


From 2c1b4a5ca48831595979a850f40ced8e7da026f8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 30 Oct 2005 14:59:58 -0800
Subject: [PATCH] swsusp: rework memory freeing on resume

The following patch makes swsusp use the PG_nosave and PG_nosave_free flags to
mark pages that should be freed in case of an error during resume.

This allows us to simplify the code and to use swsusp_free() in all of the
swsusp's resume error paths, which makes them actually work.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/suspend.c | 84 ++++++++++----------------------------------
 1 file changed, 19 insertions(+), 65 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index 02516823f514..fd2bef780882 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -147,57 +147,7 @@ extern int restore_image(void);
 
 pgd_t *temp_level4_pgt;
 
-static void **pages;
-
-static inline void *__add_page(void)
-{
-	void **c;
-
-	c = (void **)get_usable_page(GFP_ATOMIC);
-	if (c) {
-		*c = pages;
-		pages = c;
-	}
-	return c;
-}
-
-static inline void *__next_page(void)
-{
-	void **c;
-
-	c = pages;
-	if (c) {
-		pages = *c;
-		*c = NULL;
-	}
-	return c;
-}
-
-/*
- * Try to allocate as many usable pages as needed and daisy chain them.
- * If one allocation fails, free the pages allocated so far
- */
-static int alloc_usable_pages(unsigned long n)
-{
-	void *p;
-
-	pages = NULL;
-	do
-		if (!__add_page())
-			break;
-	while (--n);
-	if (n) {
-		p = __next_page();
-		while (p) {
-			free_page((unsigned long)p);
-			p = __next_page();
-		}
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
 {
 	long i, j;
 
@@ -211,7 +161,9 @@ static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long e
 		if (paddr >= end)
 			break;
 
-		pmd = (pmd_t *)__next_page();
+		pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+		if (!pmd)
+			return -ENOMEM;
 		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
 			unsigned long pe;
@@ -223,13 +175,17 @@ static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long e
 			set_pmd(pmd, __pmd(pe));
 		}
 	}
+	return 0;
 }
 
-static void set_up_temporary_mappings(void)
+static int set_up_temporary_mappings(void)
 {
 	unsigned long start, end, next;
+	int error;
 
-	temp_level4_pgt = (pgd_t *)__next_page();
+	temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
+	if (!temp_level4_pgt)
+		return -ENOMEM;
 
 	/* It is safe to reuse the original kernel mapping */
 	set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
@@ -240,29 +196,27 @@ static void set_up_temporary_mappings(void)
 	end = (unsigned long)pfn_to_kaddr(end_pfn);
 
 	for (; start < end; start = next) {
-		pud_t *pud = (pud_t *)__next_page();
+		pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+		if (!pud)
+			return -ENOMEM;
 		next = start + PGDIR_SIZE;
 		if (next > end)
 			next = end;
-		res_phys_pud_init(pud, __pa(start), __pa(next));
+		if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
+			return error;
 		set_pgd(temp_level4_pgt + pgd_index(start),
 			mk_kernel_pgd(__pa(pud)));
 	}
+	return 0;
 }
 
 int swsusp_arch_resume(void)
 {
-	unsigned long n;
+	int error;
 
-	n = ((end_pfn << PAGE_SHIFT) + PUD_SIZE - 1) >> PUD_SHIFT;
-	n += (n + PTRS_PER_PUD - 1) / PTRS_PER_PUD + 1;
-	pr_debug("swsusp_arch_resume(): pages needed = %lu\n", n);
-	if (alloc_usable_pages(n)) {
-		free_eaten_memory();
-		return -ENOMEM;
-	}
 	/* We have got enough memory and from now on we cannot recover */
-	set_up_temporary_mappings();
+	if ((error = set_up_temporary_mappings()))
+		return error;
 	restore_image();
 	return 0;
 }
-- 
cgit v1.2.2


From a8db2db1e6a8d323d87a67c5391d48fe2b97faf5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 30 Oct 2005 15:01:38 -0800
Subject: [PATCH] introduce setup_timer() helper

Every user of init_timer() also needs to initialize ->function and ->data
fields.  This patch adds a simple setup_timer() helper for that.

The schedule_timeout() is patched as an example of usage.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/i8259.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index b2a238b5a17e..c6c9791d77c1 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -494,7 +494,7 @@ void invalidate_interrupt7(void);
 void thermal_interrupt(void);
 void i8254_timer_resume(void);
 
-static void setup_timer(void)
+static void setup_timer_hardware(void)
 {
 	outb_p(0x34,0x43);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
@@ -505,13 +505,13 @@ static void setup_timer(void)
 
 static int timer_resume(struct sys_device *dev)
 {
-	setup_timer();
+	setup_timer_hardware();
 	return 0;
 }
 
 void i8254_timer_resume(void)
 {
-	setup_timer();
+	setup_timer_hardware();
 }
 
 static struct sysdev_class timer_sysclass = {
@@ -594,7 +594,7 @@ void __init init_IRQ(void)
 	 * Set the clock to HZ Hz, we already have a valid
 	 * vector now:
 	 */
-	setup_timer();
+	setup_timer_hardware();
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
-- 
cgit v1.2.2


From 371e8c25b65f2fe7942868a8a67129d571e94076 Mon Sep 17 00:00:00 2001
From: Brian Gerst <bgerst@didntduck.org>
Date: Sun, 30 Oct 2005 15:03:00 -0800
Subject: [PATCH] Remove orphaned TIOCGDEV compat ioctl

This ioctl doesn't exist for native i386.

Signed-off-by: Brian Gerst <bgerst@didntduck.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/ia32/ia32_ioctl.c | 29 -----------------------------
 1 file changed, 29 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
index 0ad5cc33b45a..4ba0e293d5e5 100644
--- a/arch/x86_64/ia32/ia32_ioctl.c
+++ b/arch/x86_64/ia32/ia32_ioctl.c
@@ -17,34 +17,6 @@
 #define CODE
 #include "compat_ioctl.c"
   
-#ifndef TIOCGDEV
-#define TIOCGDEV       _IOR('T',0x32, unsigned int)
-#endif
-static int tiocgdev(unsigned fd, unsigned cmd,  unsigned int __user *ptr) 
-{ 
-
-	struct file *file;
-	struct tty_struct *real_tty;
-	int fput_needed, ret;
-
-	file = fget_light(fd, &fput_needed);
-	if (!file)
-		return -EBADF;
-
-	ret = -EINVAL;
-	if (file->f_op->ioctl != tty_ioctl)
-		goto out;
-	real_tty = (struct tty_struct *)file->private_data;
-	if (!real_tty) 	
-		goto out;
-
-	ret = put_user(new_encode_dev(tty_devnum(real_tty)), ptr); 
-
-out:
-	fput_light(file, fput_needed);
-	return ret;
-} 
-
 #define RTC_IRQP_READ32	_IOR('p', 0x0b, unsigned int)	 /* Read IRQ rate   */
 #define RTC_IRQP_SET32	_IOW('p', 0x0c, unsigned int)	 /* Set IRQ rate    */
 #define RTC_EPOCH_READ32	_IOR('p', 0x0d, unsigned)	 /* Read epoch      */
@@ -100,7 +72,6 @@ COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain *
 COMPATIBLE_IOCTL(FIOQSIZE)
 
 /* And these ioctls need translation */
-HANDLE_IOCTL(TIOCGDEV, tiocgdev)
 /* realtime device */
 HANDLE_IOCTL(RTC_IRQP_READ,  rtc32_ioctl)
 HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl)
-- 
cgit v1.2.2


From ecea8d19c9f0ebd62ddaa07fc919ff4e4b820d99 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 30 Oct 2005 15:03:00 -0800
Subject: [PATCH] jiffies_64 cleanup

Define jiffies_64 in kernel/timer.c rather than having 24 duplicated
defines in each architecture.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/time.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 47d25ad08160..bd5ea09bfdf9 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -42,10 +42,6 @@
 #include <asm/apic.h>
 #endif
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 #ifdef CONFIG_CPU_FREQ
 static void cpufreq_delayed_get(void);
 #endif
-- 
cgit v1.2.2


From f00c96f313b07d2eb2845305b9a3395e14385767 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Sun, 30 Oct 2005 15:03:35 -0800
Subject: [PATCH] hpet-RTC: disable interrupt when no longer needed

When the emulated RTC interrupt is no longer needed, we better disable it;
otherwise, we get a spurious interrupt whenever the timer has rolled over and
reaches the same comparator value.

Having a superfluous interrupt every five minutes doesn't hurt much, but it's
bad style anyway.  ;-)

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
Acked-by: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/time.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index bd5ea09bfdf9..7037ca668c63 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1138,8 +1138,12 @@ static void hpet_rtc_timer_reinit(void)
 {
 	unsigned int cfg, cnt;
 
-	if (!(PIE_on | AIE_on | UIE_on))
+	if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
+		cfg = hpet_readl(HPET_T1_CFG);
+		cfg &= ~HPET_TN_ENABLE;
+		hpet_writel(cfg, HPET_T1_CFG);
 		return;
+	}
 
 	if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
 		hpet_rtc_int_freq = PIE_freq;
-- 
cgit v1.2.2


From 5f819949ee4e5a06c2e0054cbb42f3f0d170d779 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Sun, 30 Oct 2005 15:03:36 -0800
Subject: [PATCH] hpet-RTC: fix timer config register accesses

Make sure that the RTC timer is in non-periodic mode; some stupid BIOS might
have initialized it to periodic mode.

Furthermore, don't set the SETVAL bit in the config register.  This wouldn't
have any effect unless the timer was in period mode (which it isn't), and then
the actual timer frequency would be half that of the desired one because
incrementing the comparator in the interrupt handler would be done after the
hardware has already incremented it itself.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/time.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7037ca668c63..35a896ef4c35 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1128,7 +1128,8 @@ int hpet_rtc_timer_init(void)
 	local_irq_restore(flags);
 
 	cfg = hpet_readl(HPET_T1_CFG);
-	cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
+	cfg &= ~HPET_TN_PERIODIC;
+	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
 	hpet_writel(cfg, HPET_T1_CFG);
 
 	return 1;
@@ -1154,12 +1155,6 @@ static void hpet_rtc_timer_reinit(void)
 	cnt = hpet_readl(HPET_T1_CMP);
 	cnt += hpet_tick*HZ/hpet_rtc_int_freq;
 	hpet_writel(cnt, HPET_T1_CMP);
-
-	cfg = hpet_readl(HPET_T1_CFG);
-	cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
-	hpet_writel(cfg, HPET_T1_CFG);
-
-	return;
 }
 
 /*
-- 
cgit v1.2.2


From 7811fb8f400a3dbfa027d86bb583a31c66fddfc3 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Sun, 30 Oct 2005 15:03:36 -0800
Subject: [PATCH] hpet-RTC: cache the comparator register

Reads from an HPET register require a round trip to the south bridge and are
almost as slow as PCI reads.  By caching the last value we've written to the
comparator register, we can eliminate all HPET reads from the fast path in the
emulated RTC interrupt handler.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/time.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 35a896ef4c35..fdaddc4e5284 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1089,6 +1089,7 @@ static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
 static unsigned long PIE_count;
 
 static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
+static unsigned int hpet_t1_cmp; /* cached comparator register */
 
 int is_hpet_enabled(void)
 {
@@ -1125,6 +1126,7 @@ int hpet_rtc_timer_init(void)
 	cnt = hpet_readl(HPET_COUNTER);
 	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
 	hpet_writel(cnt, HPET_T1_CMP);
+	hpet_t1_cmp = cnt;
 	local_irq_restore(flags);
 
 	cfg = hpet_readl(HPET_T1_CFG);
@@ -1152,9 +1154,10 @@ static void hpet_rtc_timer_reinit(void)
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
 	/* It is more accurate to use the comparator value than current count.*/
-	cnt = hpet_readl(HPET_T1_CMP);
+	cnt = hpet_t1_cmp;
 	cnt += hpet_tick*HZ/hpet_rtc_int_freq;
 	hpet_writel(cnt, HPET_T1_CMP);
+	hpet_t1_cmp = cnt;
 }
 
 /*
-- 
cgit v1.2.2


From 06024f217d607369f0ee0071034ebb03071d5fb2 Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <aoliva@redhat.com>
Date: Mon, 31 Oct 2005 18:29:36 -0200
Subject: [PATCH] x86-64: bitops fix for -Os

This fixes the x86-64 find_[first|next]_zero_bit() function for the
end-of-range case.  It didn't test for a zero size, and the "rep scas"
would do entirely the wrong thing.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/lib/bitops.c | 66 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 16 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
index a29fb75b33ac..95b6d9639fba 100644
--- a/arch/x86_64/lib/bitops.c
+++ b/arch/x86_64/lib/bitops.c
@@ -5,19 +5,23 @@
 #undef find_first_bit
 #undef find_next_bit
 
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+static inline long
+__find_first_zero_bit(const unsigned long * addr, unsigned long size)
 {
 	long d0, d1, d2;
 	long res;
 
+	/*
+	 * We must test the size in words, not in bits, because
+	 * otherwise incoming sizes in the range -63..-1 will not run
+	 * any scasq instructions, and then the flags used by the je
+	 * instruction will have whatever random value was in place
+	 * before.  Nobody should call us like that, but
+	 * find_next_zero_bit() does when offset and size are at the
+	 * same word and it fails to find a zero itself.
+	 */
+	size += 63;
+	size >>= 6;
 	if (!size)
 		return 0;
 	asm volatile(
@@ -30,11 +34,29 @@ inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
 		"  shlq $3,%%rdi\n"
 		"  addq %%rdi,%%rdx"
 		:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
-		:"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL),
-		 [addr] "r" (addr) : "memory");
+		:"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
+		 [addr] "S" (addr) : "memory");
+	/*
+	 * Any register would do for [addr] above, but GCC tends to
+	 * prefer rbx over rsi, even though rsi is readily available
+	 * and doesn't have to be saved.
+	 */
 	return res;
 }
 
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+	return __find_first_zero_bit (addr, size);
+}
+
 /**
  * find_next_zero_bit - find the first zero bit in a memory region
  * @addr: The address to base the search on
@@ -43,7 +65,7 @@ inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
  */
 long find_next_zero_bit (const unsigned long * addr, long size, long offset)
 {
-	unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
+	const unsigned long * p = addr + (offset >> 6);
 	unsigned long set = 0;
 	unsigned long res, bit = offset&63;
 
@@ -63,8 +85,8 @@ long find_next_zero_bit (const unsigned long * addr, long size, long offset)
 	/*
 	 * No zero yet, search remaining full words for a zero
 	 */
-	res = find_first_zero_bit ((const unsigned long *)p,
-				   size - 64 * (p - (unsigned long *) addr));
+	res = __find_first_zero_bit (p, size - 64 * (p - addr));
+
 	return (offset + set + res);
 }
 
@@ -74,6 +96,19 @@ __find_first_bit(const unsigned long * addr, unsigned long size)
 	long d0, d1;
 	long res;
 
+	/*
+	 * We must test the size in words, not in bits, because
+	 * otherwise incoming sizes in the range -63..-1 will not run
+	 * any scasq instructions, and then the flags used by the jz
+	 * instruction will have whatever random value was in place
+	 * before.  Nobody should call us like that, but
+	 * find_next_bit() does when offset and size are at the same
+	 * word and it fails to find a one itself.
+	 */
+	size += 63;
+	size >>= 6;
+	if (!size)
+		return 0;
 	asm volatile(
 		"   repe; scasq\n"
 		"   jz 1f\n"
@@ -83,8 +118,7 @@ __find_first_bit(const unsigned long * addr, unsigned long size)
 		"   shlq $3,%%rdi\n"
 		"   addq %%rdi,%%rax"
 		:"=a" (res), "=&c" (d0), "=&D" (d1)
-		:"0" (0ULL),
-		 "1" ((size + 63) >> 6), "2" (addr),
+		:"0" (0ULL), "1" (size), "2" (addr),
 		 [addr] "r" (addr) : "memory");
 	return res;
 }
-- 
cgit v1.2.2


From cd6b0762a04978baf48412456a687842de97e381 Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Mon, 7 Nov 2005 00:59:14 -0800
Subject: [PATCH] Move Kprobes and Oprofile to "Instrumentation Support" menu

Andrew Morton suggested to move kprobes from kernel hacking menu, since
kernel hacking menu is in-appropriate for the Kprobes.  This patch moves
Kprobes and Oprofile under instrumentation menu.

(akpm: it's not a natural fit, but things like djprobes and the s390 guys'
statistics library need a home)

Signed-of-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Cc: Philippe Elie <phil.el@wanadoo.fr>
Cc: John Levon <levon@movementarian.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig          | 13 +++++++++++++
 arch/x86_64/Kconfig.debug    | 10 ----------
 arch/x86_64/oprofile/Kconfig |  6 ------
 3 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 21afa69a086d..4cce2f6f170c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -532,8 +532,21 @@ source "drivers/firmware/Kconfig"
 
 source fs/Kconfig
 
+menu "Instrumentation Support"
+        depends on EXPERIMENTAL
+
 source "arch/x86_64/oprofile/Kconfig"
 
+config KPROBES
+	bool "Kprobes (EXPERIMENTAL)"
+	help
+	  Kprobes allows you to trap at almost any kernel address and
+	  execute a callback function.  register_kprobe() establishes
+	  a probepoint and specifies the callback.  Kprobes is useful
+	  for kernel debugging, non-intrusive instrumentation and testing.
+	  If in doubt, say "N".
+endmenu
+
 source "arch/x86_64/Kconfig.debug"
 
 source "security/Kconfig"
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index 9cf1410d2f5a..d584ecc27ea1 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -33,16 +33,6 @@ config IOMMU_DEBUG
 	 options. See Documentation/x86_64/boot-options.txt for more
 	 details.
 
-config KPROBES
-	bool "Kprobes"
-	depends on DEBUG_KERNEL
-	help
-	  Kprobes allows you to trap at almost any kernel address and
-	  execute a callback function.  register_kprobe() establishes
-	  a probepoint and specifies the callback.  Kprobes is useful
-	  for kernel debugging, non-intrusive instrumentation and testing.
-	  If in doubt, say "N".
-
 config IOMMU_LEAK
        bool "IOMMU leak tracing"
        depends on DEBUG_KERNEL
diff --git a/arch/x86_64/oprofile/Kconfig b/arch/x86_64/oprofile/Kconfig
index 5ade19801b97..d8a84088471a 100644
--- a/arch/x86_64/oprofile/Kconfig
+++ b/arch/x86_64/oprofile/Kconfig
@@ -1,7 +1,3 @@
-
-menu "Profiling support"
-	depends on EXPERIMENTAL
-
 config PROFILING
 	bool "Profiling support (EXPERIMENTAL)"
 	help
@@ -19,5 +15,3 @@ config OPROFILE
 
 	  If unsure, say N.
 
-endmenu
-
-- 
cgit v1.2.2


From 481bed454247538e9f57d4ea37b153ccba24ba7b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 7 Nov 2005 00:59:47 -0800
Subject: [PATCH] consolidate sys_ptrace()

The sys_ptrace boilerplate code (everything outside the big switch
statement for the arch-specific requests) is shared by most architectures.
This patch moves it to kernel/ptrace.c and leaves the arch-specific code as
arch_ptrace.

Some architectures have a too different ptrace so we have to exclude them.
They continue to keep their implementations.  For sh64 I had to add a
sh64_ptrace wrapper because it does some initialization on the first call.
For um I removed an ifdefed SUBARCH_PTRACE_SPECIAL block, but
SUBARCH_PTRACE_SPECIAL isn't defined anywhere in the tree.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Paul Mackerras <paulus@samba.org>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-By: David Howells <dhowells@redhat.com>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/ptrace.c | 43 +------------------------------------------
 1 file changed, 1 insertion(+), 42 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index bbf64b59a21e..a87b6cebe80f 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -313,48 +313,11 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
 
 }
 
-asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	long i, ret;
 	unsigned ui;
 
-	/* This lock_kernel fixes a subtle race with suid exec */
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-	ret = ptrace_check_attach(child, request == PTRACE_KILL); 
-	if (ret < 0) 
-		goto out_tsk;
-
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
@@ -608,10 +571,6 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.2


From 66ff2d0691e00e1e7bfdf398a970310c9a0fe671 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 7 Nov 2005 01:00:07 -0800
Subject: [PATCH] Kprobes: rearrange preempt_disable/enable() calls

The following set of patches are aimed at improving kprobes scalability.  We
currently serialize kprobe registration, unregistration and handler execution
using a single spinlock - kprobe_lock.

With these changes, kprobe handlers can run without any locks held.  It also
allows for simultaneous kprobe handler executions on different processors as
we now track kprobe execution on a per processor basis.  It is now necessary
that the handlers be re-entrant since handlers can run concurrently on
multiple processors.

All changes have been tested on i386, ia64, ppc64 and x86_64, while sparc64
has been compile tested only.

The patches can be viewed as 3 logical chunks:

patch 1: 	Reorder preempt_(dis/en)able calls
patches 2-7: 	Introduce per_cpu data areas to track kprobe execution
patches 8-9: 	Use RCU to synchronize kprobe (un)registration and handler
		execution.

Thanks to Maneesh Soni, James Keniston and Anil Keshavamurthy for their
review and suggestions. Thanks again to Anil, Hien Nguyen and Kevin Stafford
for testing the patches.

This patch:

Reorder preempt_disable/enable() calls in arch kprobes files in preparation to
introduce locking changes.  No functional changes introduced by this patch.

Signed-off-by: Ananth N Mavinakayahanalli <ananth@in.ibm.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/kprobes.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 76a28b007be9..ebfa2c9241ca 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -302,9 +302,6 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 	int ret = 0;
 	kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
 
-	/* We're in an interrupt, but this is clear and BUG()-safe. */
-	preempt_disable();
-
 	/* Check we're not actually recursing */
 	if (kprobe_running()) {
 		/* We *are* holding lock here, so this is safe.
@@ -372,6 +369,11 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 		goto no_kprobe;
 	}
 
+	/*
+	 * This preempt_disable() matches the preempt_enable_no_resched()
+	 * in post_kprobe_handler()
+	 */
+	preempt_disable();
 	kprobe_status = KPROBE_HIT_ACTIVE;
 	set_current_kprobe(p, regs);
 
@@ -385,7 +387,6 @@ ss_probe:
 	return 1;
 
 no_kprobe:
-	preempt_enable_no_resched();
 	return ret;
 }
 
@@ -456,7 +457,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
         /*
          * By returning a non-zero value, we are telling
          * kprobe_handler() that we have handled unlocking
-         * and re-enabling preemption.
+	 * and re-enabling preemption
          */
         return 1;
 }
@@ -599,29 +600,29 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 				       unsigned long val, void *data)
 {
 	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+
+	preempt_disable();
 	switch (val) {
 	case DIE_INT3:
 		if (kprobe_handler(args->regs))
-			return NOTIFY_STOP;
+			ret = NOTIFY_STOP;
 		break;
 	case DIE_DEBUG:
 		if (post_kprobe_handler(args->regs))
-			return NOTIFY_STOP;
+			ret = NOTIFY_STOP;
 		break;
 	case DIE_GPF:
-		if (kprobe_running() &&
-		    kprobe_fault_handler(args->regs, args->trapnr))
-			return NOTIFY_STOP;
-		break;
 	case DIE_PAGE_FAULT:
 		if (kprobe_running() &&
 		    kprobe_fault_handler(args->regs, args->trapnr))
-			return NOTIFY_STOP;
+			ret = NOTIFY_STOP;
 		break;
 	default:
 		break;
 	}
-	return NOTIFY_DONE;
+	preempt_enable();
+	return ret;
 }
 
 int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
@@ -647,7 +648,6 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 
 void __kprobes jprobe_return(void)
 {
-	preempt_enable_no_resched();
 	asm volatile ("       xchg   %%rbx,%%rsp     \n"
 		      "       int3			\n"
 		      "       .globl jprobe_return_end	\n"
-- 
cgit v1.2.2


From e7a510f92c1e482a7db05afd3cb84af1f4cfe0bc Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 7 Nov 2005 01:00:12 -0800
Subject: [PATCH] Kprobes: Track kprobe on a per_cpu basis - x86_64 changes

x86_64 changes to track kprobe execution on a per-cpu basis.  We now track the
kprobe state machine independently on each cpu using a arch specific kprobe
control block.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/kprobes.c | 129 +++++++++++++++++++++++--------------------
 1 file changed, 70 insertions(+), 59 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index ebfa2c9241ca..6cb40d133b7c 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -44,17 +44,10 @@
 #include <asm/kdebug.h>
 
 static DECLARE_MUTEX(kprobe_mutex);
-
-static struct kprobe *current_kprobe;
-static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
-static struct kprobe *kprobe_prev;
-static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev;
-static struct pt_regs jprobe_saved_regs;
-static long *jprobe_saved_rsp;
 void jprobe_return_end(void);
 
-/* copy of the kernel stack at the probe fire time */
-static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 /*
  * returns non-zero if opcode modifies the interrupt flag.
@@ -236,29 +229,30 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
 	up(&kprobe_mutex);
 }
 
-static inline void save_previous_kprobe(void)
+static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
-	kprobe_prev = current_kprobe;
-	kprobe_status_prev = kprobe_status;
-	kprobe_old_rflags_prev = kprobe_old_rflags;
-	kprobe_saved_rflags_prev = kprobe_saved_rflags;
+	kcb->prev_kprobe.kp = kprobe_running();
+	kcb->prev_kprobe.status = kcb->kprobe_status;
+	kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
+	kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
 }
 
-static inline void restore_previous_kprobe(void)
+static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
-	current_kprobe = kprobe_prev;
-	kprobe_status = kprobe_status_prev;
-	kprobe_old_rflags = kprobe_old_rflags_prev;
-	kprobe_saved_rflags = kprobe_saved_rflags_prev;
+	__get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+	kcb->kprobe_status = kcb->prev_kprobe.status;
+	kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
+	kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
 }
 
-static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs)
+static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+				struct kprobe_ctlblk *kcb)
 {
-	current_kprobe = p;
-	kprobe_saved_rflags = kprobe_old_rflags
+	__get_cpu_var(current_kprobe) = p;
+	kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
 		= (regs->eflags & (TF_MASK | IF_MASK));
 	if (is_IF_modifier(p->ainsn.insn))
-		kprobe_saved_rflags &= ~IF_MASK;
+		kcb->kprobe_saved_rflags &= ~IF_MASK;
 }
 
 static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -301,6 +295,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 	struct kprobe *p;
 	int ret = 0;
 	kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
 	/* Check we're not actually recursing */
 	if (kprobe_running()) {
@@ -308,13 +303,13 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS &&
+			if (kcb->kprobe_status == KPROBE_HIT_SS &&
 				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
 				regs->eflags &= ~TF_MASK;
-				regs->eflags |= kprobe_saved_rflags;
+				regs->eflags |= kcb->kprobe_saved_rflags;
 				unlock_kprobes();
 				goto no_kprobe;
-			} else if (kprobe_status == KPROBE_HIT_SSDONE) {
+			} else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
 				/* TODO: Provide re-entrancy from
 				 * post_kprobes_handler() and avoid exception
 				 * stack corruption while single-stepping on
@@ -322,6 +317,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 				 */
 				arch_disarm_kprobe(p);
 				regs->rip = (unsigned long)p->addr;
+				reset_current_kprobe();
 				ret = 1;
 			} else {
 				/* We have reentered the kprobe_handler(), since
@@ -331,15 +327,15 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 				 * of the new probe without calling any user
 				 * handlers.
 				 */
-				save_previous_kprobe();
-				set_current_kprobe(p, regs);
+				save_previous_kprobe(kcb);
+				set_current_kprobe(p, regs, kcb);
 				p->nmissed++;
 				prepare_singlestep(p, regs);
-				kprobe_status = KPROBE_REENTER;
+				kcb->kprobe_status = KPROBE_REENTER;
 				return 1;
 			}
 		} else {
-			p = current_kprobe;
+			p = __get_cpu_var(current_kprobe);
 			if (p->break_handler && p->break_handler(p, regs)) {
 				goto ss_probe;
 			}
@@ -374,8 +370,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 	 * in post_kprobe_handler()
 	 */
 	preempt_disable();
-	kprobe_status = KPROBE_HIT_ACTIVE;
-	set_current_kprobe(p, regs);
+	set_current_kprobe(p, regs, kcb);
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 
 	if (p->pre_handler && p->pre_handler(p, regs))
 		/* handler has already set things up, so skip ss setup */
@@ -383,7 +379,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 
 ss_probe:
 	prepare_singlestep(p, regs);
-	kprobe_status = KPROBE_HIT_SS;
+	kcb->kprobe_status = KPROBE_HIT_SS;
 	return 1;
 
 no_kprobe:
@@ -451,6 +447,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address));
 	regs->rip = orig_ret_address;
 
+	reset_current_kprobe();
 	unlock_kprobes();
 	preempt_enable_no_resched();
 
@@ -484,7 +481,8 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
  * that is atop the stack is the address following the copied instruction.
  * We need to make it the address following the original instruction.
  */
-static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
+static void __kprobes resume_execution(struct kprobe *p,
+		struct pt_regs *regs, struct kprobe_ctlblk *kcb)
 {
 	unsigned long *tos = (unsigned long *)regs->rsp;
 	unsigned long next_rip = 0;
@@ -499,7 +497,7 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
 	switch (*insn) {
 	case 0x9c:		/* pushfl */
 		*tos &= ~(TF_MASK | IF_MASK);
-		*tos |= kprobe_old_rflags;
+		*tos |= kcb->kprobe_old_rflags;
 		break;
 	case 0xc3:		/* ret/lret */
 	case 0xcb:
@@ -544,24 +542,28 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
  */
 int __kprobes post_kprobe_handler(struct pt_regs *regs)
 {
-	if (!kprobe_running())
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (!cur)
 		return 0;
 
-	if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) {
-		kprobe_status = KPROBE_HIT_SSDONE;
-		current_kprobe->post_handler(current_kprobe, regs, 0);
+	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		cur->post_handler(cur, regs, 0);
 	}
 
-	resume_execution(current_kprobe, regs);
-	regs->eflags |= kprobe_saved_rflags;
+	resume_execution(cur, regs, kcb);
+	regs->eflags |= kcb->kprobe_saved_rflags;
 
 	/* Restore the original saved kprobes variables and continue. */
-	if (kprobe_status == KPROBE_REENTER) {
-		restore_previous_kprobe();
+	if (kcb->kprobe_status == KPROBE_REENTER) {
+		restore_previous_kprobe(kcb);
 		goto out;
 	} else {
 		unlock_kprobes();
 	}
+	reset_current_kprobe();
 out:
 	preempt_enable_no_resched();
 
@@ -579,14 +581,17 @@ out:
 /* Interrupts disabled, kprobe_lock held. */
 int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
-	if (current_kprobe->fault_handler
-	    && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
 		return 1;
 
-	if (kprobe_status & KPROBE_HIT_SS) {
-		resume_execution(current_kprobe, regs);
-		regs->eflags |= kprobe_old_rflags;
+	if (kcb->kprobe_status & KPROBE_HIT_SS) {
+		resume_execution(cur, regs, kcb);
+		regs->eflags |= kcb->kprobe_old_rflags;
 
+		reset_current_kprobe();
 		unlock_kprobes();
 		preempt_enable_no_resched();
 	}
@@ -629,10 +634,11 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
 	unsigned long addr;
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
-	jprobe_saved_regs = *regs;
-	jprobe_saved_rsp = (long *) regs->rsp;
-	addr = (unsigned long)jprobe_saved_rsp;
+	kcb->jprobe_saved_regs = *regs;
+	kcb->jprobe_saved_rsp = (long *) regs->rsp;
+	addr = (unsigned long)(kcb->jprobe_saved_rsp);
 	/*
 	 * As Linus pointed out, gcc assumes that the callee
 	 * owns the argument space and could overwrite it, e.g.
@@ -640,7 +646,8 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	 * we also save and restore enough stack bytes to cover
 	 * the argument area.
 	 */
-	memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
+	memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
+			MIN_STACK_SIZE(addr));
 	regs->eflags &= ~IF_MASK;
 	regs->rip = (unsigned long)(jp->entry);
 	return 1;
@@ -648,34 +655,38 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 
 void __kprobes jprobe_return(void)
 {
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
 	asm volatile ("       xchg   %%rbx,%%rsp     \n"
 		      "       int3			\n"
 		      "       .globl jprobe_return_end	\n"
 		      "       jprobe_return_end:	\n"
 		      "       nop			\n"::"b"
-		      (jprobe_saved_rsp):"memory");
+		      (kcb->jprobe_saved_rsp):"memory");
 }
 
 int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 	u8 *addr = (u8 *) (regs->rip - 1);
-	unsigned long stack_addr = (unsigned long)jprobe_saved_rsp;
+	unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
 
 	if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
-		if ((long *)regs->rsp != jprobe_saved_rsp) {
+		if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
 			struct pt_regs *saved_regs =
-			    container_of(jprobe_saved_rsp, struct pt_regs, rsp);
+			    container_of(kcb->jprobe_saved_rsp,
+					    struct pt_regs, rsp);
 			printk("current rsp %p does not match saved rsp %p\n",
-			       (long *)regs->rsp, jprobe_saved_rsp);
+			       (long *)regs->rsp, kcb->jprobe_saved_rsp);
 			printk("Saved registers for jprobe %p\n", jp);
 			show_registers(saved_regs);
 			printk("Current registers\n");
 			show_registers(regs);
 			BUG();
 		}
-		*regs = jprobe_saved_regs;
-		memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
+		*regs = kcb->jprobe_saved_regs;
+		memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
 		       MIN_STACK_SIZE(stack_addr));
 		return 1;
 	}
-- 
cgit v1.2.2


From 991a51d83a3d9bebfafdd1e692cf310899d60791 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 7 Nov 2005 01:00:14 -0800
Subject: [PATCH] Kprobes: Use RCU for (un)register synchronization - arch
 changes

Changes to the arch kprobes infrastructure to take advantage of the locking
changes introduced by usage of RCU for synchronization.  All handlers are now
run without any locks held, so they have to be re-entrant or provide their own
synchronization.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/kprobes.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 6cb40d133b7c..9bef2c8dc12c 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -34,7 +34,6 @@
 #include <linux/config.h>
 #include <linux/kprobes.h>
 #include <linux/ptrace.h>
-#include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/preempt.h>
@@ -266,6 +265,7 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 		regs->rip = (unsigned long)p->ainsn.insn;
 }
 
+/* Called with kretprobe_lock held */
 void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
 				      struct pt_regs *regs)
 {
@@ -299,15 +299,12 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 
 	/* Check we're not actually recursing */
 	if (kprobe_running()) {
-		/* We *are* holding lock here, so this is safe.
-		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
 			if (kcb->kprobe_status == KPROBE_HIT_SS &&
 				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
 				regs->eflags &= ~TF_MASK;
 				regs->eflags |= kcb->kprobe_saved_rflags;
-				unlock_kprobes();
 				goto no_kprobe;
 			} else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
 				/* TODO: Provide re-entrancy from
@@ -340,14 +337,11 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 				goto ss_probe;
 			}
 		}
-		/* If it's not ours, can't be delete race, (we hold lock). */
 		goto no_kprobe;
 	}
 
-	lock_kprobes();
 	p = get_kprobe(addr);
 	if (!p) {
-		unlock_kprobes();
 		if (*addr != BREAKPOINT_INSTRUCTION) {
 			/*
 			 * The breakpoint instruction was removed right
@@ -406,9 +400,10 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
         struct kretprobe_instance *ri = NULL;
         struct hlist_head *head;
         struct hlist_node *node, *tmp;
-	unsigned long orig_ret_address = 0;
+	unsigned long flags, orig_ret_address = 0;
 	unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
 
+	spin_lock_irqsave(&kretprobe_lock, flags);
         head = kretprobe_inst_table_head(current);
 
 	/*
@@ -448,7 +443,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	regs->rip = orig_ret_address;
 
 	reset_current_kprobe();
-	unlock_kprobes();
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
 	preempt_enable_no_resched();
 
         /*
@@ -536,10 +531,6 @@ static void __kprobes resume_execution(struct kprobe *p,
 	}
 }
 
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function.  And we hold kprobe lock.
- */
 int __kprobes post_kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *cur = kprobe_running();
@@ -560,8 +551,6 @@ int __kprobes post_kprobe_handler(struct pt_regs *regs)
 	if (kcb->kprobe_status == KPROBE_REENTER) {
 		restore_previous_kprobe(kcb);
 		goto out;
-	} else {
-		unlock_kprobes();
 	}
 	reset_current_kprobe();
 out:
@@ -578,7 +567,6 @@ out:
 	return 1;
 }
 
-/* Interrupts disabled, kprobe_lock held. */
 int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	struct kprobe *cur = kprobe_running();
@@ -592,7 +580,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		regs->eflags |= kcb->kprobe_old_rflags;
 
 		reset_current_kprobe();
-		unlock_kprobes();
 		preempt_enable_no_resched();
 	}
 	return 0;
@@ -607,7 +594,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
 
-	preempt_disable();
+	rcu_read_lock();
 	switch (val) {
 	case DIE_INT3:
 		if (kprobe_handler(args->regs))
@@ -626,7 +613,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	default:
 		break;
 	}
-	preempt_enable();
+	rcu_read_unlock();
 	return ret;
 }
 
-- 
cgit v1.2.2


From d217d5450f11d8c907c0458d175b0dc999b4d06d Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 7 Nov 2005 01:00:14 -0800
Subject: [PATCH] Kprobes: preempt_disable/enable() simplification

Reorganize the preempt_disable/enable calls to eliminate the extra preempt
depth.  Changes based on Paul McKenney's review suggestions for the kprobes
RCU changeset.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/kprobes.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 9bef2c8dc12c..dddeb678b440 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -286,16 +286,19 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
         }
 }
 
-/*
- * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
- */
 int __kprobes kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *p;
 	int ret = 0;
 	kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	struct kprobe_ctlblk *kcb;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
 
 	/* Check we're not actually recursing */
 	if (kprobe_running()) {
@@ -359,11 +362,6 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 		goto no_kprobe;
 	}
 
-	/*
-	 * This preempt_disable() matches the preempt_enable_no_resched()
-	 * in post_kprobe_handler()
-	 */
-	preempt_disable();
 	set_current_kprobe(p, regs, kcb);
 	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 
@@ -377,6 +375,7 @@ ss_probe:
 	return 1;
 
 no_kprobe:
+	preempt_enable_no_resched();
 	return ret;
 }
 
@@ -448,8 +447,8 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 
         /*
          * By returning a non-zero value, we are telling
-         * kprobe_handler() that we have handled unlocking
-	 * and re-enabling preemption
+         * kprobe_handler() that we don't want the post_handler
+	 * to run (and have re-enabled preemption)
          */
         return 1;
 }
@@ -594,7 +593,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
 
-	rcu_read_lock();
 	switch (val) {
 	case DIE_INT3:
 		if (kprobe_handler(args->regs))
@@ -606,14 +604,16 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 		break;
 	case DIE_GPF:
 	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id() */
+		preempt_disable();
 		if (kprobe_running() &&
 		    kprobe_fault_handler(args->regs, args->trapnr))
 			ret = NOTIFY_STOP;
+		preempt_enable();
 		break;
 	default:
 		break;
 	}
-	rcu_read_unlock();
 	return ret;
 }
 
@@ -675,6 +675,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 		*regs = kcb->jprobe_saved_regs;
 		memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
 		       MIN_STACK_SIZE(stack_addr));
+		preempt_enable_no_resched();
 		return 1;
 	}
 	return 0;
-- 
cgit v1.2.2


From 5fed0578be842dd7d24e5240a75b02bbc748501f Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 7 Nov 2005 01:01:46 -0800
Subject: [PATCH] unexport phys_proc_id and cpu_core_id

EXPORT_SYMBOL's for phys_proc_id and cpu_core_id were added this year but
never used.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/smpboot.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 658a81b33f3b..4b5b088ec102 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -65,8 +65,6 @@ int smp_num_siblings = 1;
 /* Package ID of each logical CPU */
 u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
-EXPORT_SYMBOL(phys_proc_id);
-EXPORT_SYMBOL(cpu_core_id);
 
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map __read_mostly;
-- 
cgit v1.2.2


From b05a581d4865d74c0e270d27156a88d2dee9494e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 8 Nov 2005 21:35:08 -0800
Subject: [PATCH] move some COMPATIBLE_IOCTL entries from x86_64 to common code

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/ia32/ia32_ioctl.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
index 4ba0e293d5e5..e335bd0b637d 100644
--- a/arch/x86_64/ia32/ia32_ioctl.c
+++ b/arch/x86_64/ia32/ia32_ioctl.c
@@ -64,12 +64,6 @@ struct ioctl_trans ioctl_start[] = {
 #include <linux/compat_ioctl.h>
 #define DECLARES
 #include "compat_ioctl.c"
-COMPATIBLE_IOCTL(HDIO_SET_KEEPSETTINGS)
-COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
-COMPATIBLE_IOCTL(BLKRASET)
-COMPATIBLE_IOCTL(0x4B50)   /* KDGHWCLK - not in the kernel, but don't complain */
-COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain */
-COMPATIBLE_IOCTL(FIOQSIZE)
 
 /* And these ioctls need translation */
 /* realtime device */
-- 
cgit v1.2.2


From 5bfb5d690f36d316a5f3b4f7775fda996faa6b12 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Tue, 8 Nov 2005 21:39:01 -0800
Subject: [PATCH] sched: disable preempt in idle tasks

Run idle threads with preempt disabled.

Also corrected a bugs in arm26's cpu_idle (make it actually call schedule()).
How did it ever work before?

Might fix the CPU hotplugging hang which Nigel Cunningham noted.

We think the bug hits if the idle thread is preempted after checking
need_resched() and before going to sleep, then the CPU offlined.

After calling stop_machine_run, the CPU eventually returns from preemption and
into the idle thread and goes to sleep.  The CPU will continue executing
previous idle and have no chance to call play_dead.

By disabling preemption until we are ready to explicitly schedule, this bug is
fixed and the idle threads generally become more robust.

From: alexs <ashepard@u.washington.edu>

  PPC build fix

From: Yoichi Yuasa <yuasa@hh.iij4u.or.jp>

  MIPS build fix

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Yoichi Yuasa <yuasa@hh.iij4u.or.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/process.c | 2 ++
 arch/x86_64/kernel/smpboot.c | 1 +
 2 files changed, 3 insertions(+)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index b5a89c0bdf59..571f9fe490ce 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -204,7 +204,9 @@ void cpu_idle (void)
 			idle();
 		}
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 4b5b088ec102..c4e59bbdc187 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -472,6 +472,7 @@ void __cpuinit start_secondary(void)
 	 * things done here to the most necessary things.
 	 */
 	cpu_init();
+	preempt_disable();
 	smp_callin();
 
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
-- 
cgit v1.2.2


From 64c7c8f88559624abdbe12b5da6502e8879f8d28 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Tue, 8 Nov 2005 21:39:04 -0800
Subject: [PATCH] sched: resched and cpu_idle rework

Make some changes to the NEED_RESCHED and POLLING_NRFLAG to reduce
confusion, and make their semantics rigid.  Improves efficiency of
resched_task and some cpu_idle routines.

* In resched_task:
- TIF_NEED_RESCHED is only cleared with the task's runqueue lock held,
  and as we hold it during resched_task, then there is no need for an
  atomic test and set there. The only other time this should be set is
  when the task's quantum expires, in the timer interrupt - this is
  protected against because the rq lock is irq-safe.

- If TIF_NEED_RESCHED is set, then we don't need to do anything. It
  won't get unset until the task get's schedule()d off.

- If we are running on the same CPU as the task we resched, then set
  TIF_NEED_RESCHED and no further action is required.

- If we are running on another CPU, and TIF_POLLING_NRFLAG is *not* set
  after TIF_NEED_RESCHED has been set, then we need to send an IPI.

Using these rules, we are able to remove the test and set operation in
resched_task, and make clear the previously vague semantics of
POLLING_NRFLAG.

* In idle routines:
- Enter cpu_idle with preempt disabled. When the need_resched() condition
  becomes true, explicitly call schedule(). This makes things a bit clearer
  (IMO), but haven't updated all architectures yet.

- Many do a test and clear of TIF_NEED_RESCHED for some reason. According
  to the resched_task rules, this isn't needed (and actually breaks the
  assumption that TIF_NEED_RESCHED is only cleared with the runqueue lock
  held). So remove that. Generally one less locked memory op when switching
  to the idle thread.

- Many idle routines clear TIF_POLLING_NRFLAG, and only set it in the inner
  most polling idle loops. The above resched_task semantics allow it to be
  set until before the last time need_resched() is checked before going into
  a halt requiring interrupt wakeup.

  Many idle routines simply never enter such a halt, and so POLLING_NRFLAG
  can be always left set, completely eliminating resched IPIs when rescheduling
  the idle task.

  POLLING_NRFLAG width can be increased, to reduce the chance of resched IPIs.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/process.c | 67 ++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 36 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 571f9fe490ce..59be85d9a4bc 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -86,12 +86,22 @@ EXPORT_SYMBOL(enable_hlt);
  */
 void default_idle(void)
 {
+	local_irq_enable();
+
 	if (!atomic_read(&hlt_counter)) {
-		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
-		else
-			local_irq_enable();
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb__after_clear_bit();
+		while (!need_resched()) {
+			local_irq_disable();
+			if (!need_resched())
+				safe_halt();
+			else
+				local_irq_enable();
+		}
+		set_thread_flag(TIF_POLLING_NRFLAG);
+	} else {
+		while (!need_resched())
+			cpu_relax();
 	}
 }
 
@@ -102,30 +112,16 @@ void default_idle(void)
  */
 static void poll_idle (void)
 {
-	int oldval;
-
 	local_irq_enable();
 
-	/*
-	 * Deal with another CPU just having chosen a thread to
-	 * run here:
-	 */
-	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
-	if (!oldval) {
-		set_thread_flag(TIF_POLLING_NRFLAG); 
-		asm volatile(
-			"2:"
-			"testl %0,%1;"
-			"rep; nop;"
-			"je 2b;"
-			: :
-			"i" (_TIF_NEED_RESCHED), 
-			"m" (current_thread_info()->flags));
-		clear_thread_flag(TIF_POLLING_NRFLAG);
-	} else {
-		set_need_resched();
-	}
+	asm volatile(
+		"2:"
+		"testl %0,%1;"
+		"rep; nop;"
+		"je 2b;"
+		: :
+		"i" (_TIF_NEED_RESCHED),
+		"m" (current_thread_info()->flags));
 }
 
 void cpu_idle_wait(void)
@@ -187,6 +183,8 @@ static inline void play_dead(void)
  */
 void cpu_idle (void)
 {
+	set_thread_flag(TIF_POLLING_NRFLAG);
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched()) {
@@ -221,15 +219,12 @@ static void mwait_idle(void)
 {
 	local_irq_enable();
 
-	if (!need_resched()) {
-		set_thread_flag(TIF_POLLING_NRFLAG);
-		do {
-			__monitor((void *)&current_thread_info()->flags, 0, 0);
-			if (need_resched())
-				break;
-			__mwait(0, 0);
-		} while (!need_resched());
-		clear_thread_flag(TIF_POLLING_NRFLAG);
+	while (!need_resched()) {
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (need_resched())
+			break;
+		__mwait(0, 0);
 	}
 }
 
-- 
cgit v1.2.2


From d6c7ac081bf6cafcf780b919ee97978f1d01a0d7 Mon Sep 17 00:00:00 2001
From: Karsten Wiese <annabellesgarden@yahoo.de>
Date: Sun, 13 Nov 2005 16:06:22 -0800
Subject: [PATCH] x86_64 two timer entries in /sys

attached patch renames one instance of
	/sys/devices/system/timer
to
	/sys/devices/system/timer_pit
to avoid a name clash with another instance created in time.c.

Acked-by: Andi Kleen <ak@muc.de>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/i8259.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index c6c9791d77c1..a9368d4c4aba 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -515,7 +515,7 @@ void i8254_timer_resume(void)
 }
 
 static struct sysdev_class timer_sysclass = {
-	set_kset_name("timer"),
+	set_kset_name("timer_pit"),
 	.resume		= timer_resume,
 };
 
-- 
cgit v1.2.2


From 56720367cd89ef5265f39da2d674c5b92cd4cd87 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Update defconfig

Rerun and enable autofs 4, relayfs and softdog

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/defconfig | 98 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 83 insertions(+), 15 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index f8db7e500fbf..5d56542fb68f 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.13-git11
-# Mon Sep 12 16:16:16 2005
+# Linux kernel version: 2.6.14-git7
+# Sat Nov  5 15:55:50 2005
 #
 CONFIG_X86_64=y
 CONFIG_64BIT=y
@@ -35,7 +35,7 @@ CONFIG_POSIX_MQUEUE=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 CONFIG_SYSCTL=y
 # CONFIG_AUDIT is not set
-# CONFIG_HOTPLUG is not set
+CONFIG_HOTPLUG=y
 CONFIG_KOBJECT_UEVENT=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
@@ -93,10 +93,11 @@ CONFIG_PREEMPT_NONE=y
 # CONFIG_PREEMPT_VOLUNTARY is not set
 # CONFIG_PREEMPT is not set
 CONFIG_PREEMPT_BKL=y
+CONFIG_NUMA=y
 CONFIG_K8_NUMA=y
+CONFIG_X86_64_ACPI_NUMA=y
 # CONFIG_NUMA_EMU is not set
 CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
-CONFIG_NUMA=y
 CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y
 CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_SELECT_MEMORY_MODEL=y
@@ -107,9 +108,10 @@ CONFIG_DISCONTIGMEM=y
 CONFIG_FLAT_NODE_MEM_MAP=y
 CONFIG_NEED_MULTIPLE_NODES=y
 # CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPLIT_PTLOCK_CPUS=4
 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
-CONFIG_HAVE_DEC_LOCK=y
 CONFIG_NR_CPUS=32
+CONFIG_HOTPLUG_CPU=y
 CONFIG_HPET_TIMER=y
 CONFIG_X86_PM_TIMER=y
 CONFIG_HPET_EMULATE_RTC=y
@@ -117,6 +119,7 @@ CONFIG_GART_IOMMU=y
 CONFIG_SWIOTLB=y
 CONFIG_X86_MCE=y
 CONFIG_X86_MCE_INTEL=y
+CONFIG_X86_MCE_AMD=y
 CONFIG_PHYSICAL_START=0x100000
 # CONFIG_KEXEC is not set
 CONFIG_SECCOMP=y
@@ -136,11 +139,15 @@ CONFIG_PM=y
 # CONFIG_PM_DEBUG is not set
 CONFIG_SOFTWARE_SUSPEND=y
 CONFIG_PM_STD_PARTITION=""
+CONFIG_SUSPEND_SMP=y
 
 #
 # ACPI (Advanced Configuration and Power Interface) Support
 #
 CONFIG_ACPI=y
+CONFIG_ACPI_SLEEP=y
+CONFIG_ACPI_SLEEP_PROC_FS=y
+CONFIG_ACPI_SLEEP_PROC_SLEEP=y
 CONFIG_ACPI_AC=y
 CONFIG_ACPI_BATTERY=y
 CONFIG_ACPI_BUTTON=y
@@ -148,6 +155,7 @@ CONFIG_ACPI_BUTTON=y
 CONFIG_ACPI_HOTKEY=m
 CONFIG_ACPI_FAN=y
 CONFIG_ACPI_PROCESSOR=y
+CONFIG_ACPI_HOTPLUG_CPU=y
 CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_NUMA=y
 # CONFIG_ACPI_ASUS is not set
@@ -158,7 +166,7 @@ CONFIG_ACPI_BLACKLIST_YEAR=2001
 CONFIG_ACPI_EC=y
 CONFIG_ACPI_POWER=y
 CONFIG_ACPI_SYSTEM=y
-# CONFIG_ACPI_CONTAINER is not set
+CONFIG_ACPI_CONTAINER=y
 
 #
 # CPU Frequency scaling
@@ -293,7 +301,6 @@ CONFIG_IPV6=y
 # Network testing
 #
 # CONFIG_NET_PKTGEN is not set
-# CONFIG_NETFILTER_NETLINK is not set
 # CONFIG_HAMRADIO is not set
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
@@ -311,6 +318,11 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y
 # CONFIG_FW_LOADER is not set
 # CONFIG_DEBUG_DRIVER is not set
 
+#
+# Connector - unified userspace <-> kernelspace linker
+#
+# CONFIG_CONNECTOR is not set
+
 #
 # Memory Technology Devices (MTD)
 #
@@ -354,6 +366,11 @@ CONFIG_IOSCHED_NOOP=y
 # CONFIG_IOSCHED_AS is not set
 CONFIG_IOSCHED_DEADLINE=y
 CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+CONFIG_DEFAULT_DEADLINE=y
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="cfq"
 # CONFIG_ATA_OVER_ETH is not set
 
 #
@@ -450,6 +467,7 @@ CONFIG_BLK_DEV_SD=y
 CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_FC_ATTRS is not set
 # CONFIG_SCSI_ISCSI_ATTRS is not set
+# CONFIG_SCSI_SAS_ATTRS is not set
 
 #
 # SCSI low-level drivers
@@ -469,20 +487,24 @@ CONFIG_AIC79XX_DEBUG_MASK=0
 # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
 # CONFIG_MEGARAID_NEWGEN is not set
 # CONFIG_MEGARAID_LEGACY is not set
+# CONFIG_MEGARAID_SAS is not set
 CONFIG_SCSI_SATA=y
 # CONFIG_SCSI_SATA_AHCI is not set
 # CONFIG_SCSI_SATA_SVW is not set
 CONFIG_SCSI_ATA_PIIX=y
 # CONFIG_SCSI_SATA_MV is not set
-# CONFIG_SCSI_SATA_NV is not set
-# CONFIG_SCSI_SATA_PROMISE is not set
+CONFIG_SCSI_SATA_NV=y
+# CONFIG_SCSI_PDC_ADMA is not set
 # CONFIG_SCSI_SATA_QSTOR is not set
+# CONFIG_SCSI_SATA_PROMISE is not set
 # CONFIG_SCSI_SATA_SX4 is not set
 # CONFIG_SCSI_SATA_SIL is not set
+# CONFIG_SCSI_SATA_SIL24 is not set
 # CONFIG_SCSI_SATA_SIS is not set
 # CONFIG_SCSI_SATA_ULI is not set
 CONFIG_SCSI_SATA_VIA=y
 # CONFIG_SCSI_SATA_VITESSE is not set
+CONFIG_SCSI_SATA_INTEL_COMBINED=y
 # CONFIG_SCSI_BUSLOGIC is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_EATA is not set
@@ -525,6 +547,7 @@ CONFIG_BLK_DEV_DM=y
 CONFIG_FUSION=y
 CONFIG_FUSION_SPI=y
 # CONFIG_FUSION_FC is not set
+# CONFIG_FUSION_SAS is not set
 CONFIG_FUSION_MAX_SGE=128
 # CONFIG_FUSION_CTL is not set
 
@@ -564,6 +587,7 @@ CONFIG_NET_ETHERNET=y
 CONFIG_MII=y
 # CONFIG_HAPPYMEAL is not set
 # CONFIG_SUNGEM is not set
+# CONFIG_CASSINI is not set
 CONFIG_NET_VENDOR_3COM=y
 CONFIG_VORTEX=y
 # CONFIG_TYPHOON is not set
@@ -740,7 +764,43 @@ CONFIG_LEGACY_PTY_COUNT=256
 #
 # Watchdog Cards
 #
-# CONFIG_WATCHDOG is not set
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=y
+# CONFIG_ACQUIRE_WDT is not set
+# CONFIG_ADVANTECH_WDT is not set
+# CONFIG_ALIM1535_WDT is not set
+# CONFIG_ALIM7101_WDT is not set
+# CONFIG_SC520_WDT is not set
+# CONFIG_EUROTECH_WDT is not set
+# CONFIG_IB700_WDT is not set
+# CONFIG_IBMASR is not set
+# CONFIG_WAFER_WDT is not set
+# CONFIG_I6300ESB_WDT is not set
+# CONFIG_I8XX_TCO is not set
+# CONFIG_SC1200_WDT is not set
+# CONFIG_60XX_WDT is not set
+# CONFIG_SBC8360_WDT is not set
+# CONFIG_CPU5_WDT is not set
+# CONFIG_W83627HF_WDT is not set
+# CONFIG_W83877F_WDT is not set
+# CONFIG_W83977F_WDT is not set
+# CONFIG_MACHZ_WDT is not set
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
 CONFIG_HW_RANDOM=y
 # CONFIG_NVRAM is not set
 CONFIG_RTC=y
@@ -767,6 +827,7 @@ CONFIG_MAX_RAW_DEVS=256
 # TPM devices
 #
 # CONFIG_TCG_TPM is not set
+# CONFIG_TELCLOCK is not set
 
 #
 # I2C support
@@ -783,6 +844,7 @@ CONFIG_MAX_RAW_DEVS=256
 #
 CONFIG_HWMON=y
 # CONFIG_HWMON_VID is not set
+# CONFIG_SENSORS_HDAPS is not set
 # CONFIG_HWMON_DEBUG_CHIP is not set
 
 #
@@ -886,12 +948,15 @@ CONFIG_USB_UHCI_HCD=y
 # USB Device Class drivers
 #
 # CONFIG_OBSOLETE_OSS_USB_DRIVER is not set
-# CONFIG_USB_BLUETOOTH_TTY is not set
 # CONFIG_USB_ACM is not set
 CONFIG_USB_PRINTER=y
 
 #
-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# may also be needed; see USB_STORAGE Help for more information
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
@@ -924,6 +989,7 @@ CONFIG_USB_HIDINPUT=y
 # CONFIG_USB_XPAD is not set
 # CONFIG_USB_ATI_REMOTE is not set
 # CONFIG_USB_KEYSPAN_REMOTE is not set
+# CONFIG_USB_APPLETOUCH is not set
 
 #
 # USB Imaging devices
@@ -1005,7 +1071,7 @@ CONFIG_USB_MON=y
 #
 # CONFIG_EDD is not set
 # CONFIG_DELL_RBU is not set
-CONFIG_DCDBAS=m
+# CONFIG_DCDBAS is not set
 
 #
 # File systems
@@ -1037,7 +1103,7 @@ CONFIG_INOTIFY=y
 # CONFIG_QUOTA is not set
 CONFIG_DNOTIFY=y
 CONFIG_AUTOFS_FS=y
-# CONFIG_AUTOFS4_FS is not set
+CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 
 #
@@ -1068,7 +1134,7 @@ CONFIG_TMPFS=y
 CONFIG_HUGETLBFS=y
 CONFIG_HUGETLB_PAGE=y
 CONFIG_RAMFS=y
-# CONFIG_RELAYFS_FS is not set
+CONFIG_RELAYFS_FS=y
 
 #
 # Miscellaneous filesystems
@@ -1186,7 +1252,9 @@ CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_DEBUG_KOBJECT is not set
 # CONFIG_DEBUG_INFO is not set
 CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
 # CONFIG_FRAME_POINTER is not set
+# CONFIG_RCU_TORTURE_TEST is not set
 CONFIG_INIT_DEBUG=y
 # CONFIG_IOMMU_DEBUG is not set
 CONFIG_KPROBES=y
-- 
cgit v1.2.2


From a2f1b424900715ed9d1699c3bb88a434a2b42bc0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Add 4GB DMA32 zone

Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones.

As a bit of historical background: when the x86-64 port
was originally designed we had some discussion if we should
use a 16MB DMA zone like i386 or a 4GB DMA zone like IA64 or
both. Both was ruled out at this point because it was in early
2.4 when VM is still quite shakey and had bad troubles even
dealing with one DMA zone.  We settled on the 16MB DMA zone mainly
because we worried about older soundcards and the floppy.

But this has always caused problems since then because
device drivers had trouble getting enough DMA able memory. These days
the VM works much better and the wide use of NUMA has proven
it can deal with many zones successfully.

So this patch adds both zones.

This helps drivers who need a lot of memory below 4GB because
their hardware is not accessing more (graphic drivers - proprietary
and free ones, video frame buffer drivers, sound drivers etc.).
Previously they could only use IOMMU+16MB GFP_DMA, which
was not enough memory.

Another common problem is that hardware who has full memory
addressing for >4GB misses it for some control structures in memory
(like transmit rings or other metadata).  They tended to allocate memory
in the 16MB GFP_DMA or the IOMMU/swiotlb then using pci_alloc_consistent,
but that can tie up a lot of precious 16MB GFPDMA/IOMMU/swiotlb memory
(even on AMD systems the IOMMU tends to be quite small) especially if you have
many devices.  With the new zone pci_alloc_consistent can just put
this stuff into memory below 4GB which works better.

One argument was still if the zone should be 4GB or 2GB. The main
motivation for 2GB would be an unnamed not so unpopular hardware
raid controller (mostly found in older machines from a particular four letter
company) who has a strange 2GB restriction in firmware. But
that one works ok with swiotlb/IOMMU anyways, so it doesn't really
need GFP_DMA32. I chose 4GB to be compatible with IA64 and because
it seems to be the most common restriction.

The new zone is so far added only for x86-64.

For other architectures who don't set up this
new zone nothing changes. Architectures can set a compatibility
define in Kconfig CONFIG_DMA_IS_DMA32 that will define GFP_DMA32
as GFP_DMA. Otherwise it's a nop because on 32bit architectures
it's normally not needed because GFP_NORMAL (=0) is DMA able
enough.

One problem is still that GFP_DMA means different things on different
architectures. e.g. some drivers used to have #ifdef ia64  use GFP_DMA
(trusting it to be 4GB) #elif __x86_64__ (use other hacks like
the swiotlb because 16MB is not enough) ... . This was quite
ugly and is now obsolete.

These should be now converted to use GFP_DMA32 unconditionally. I haven't done
this yet. Or best only use pci_alloc_consistent/dma_alloc_coherent
which will use GFP_DMA32 transparently.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/init.c | 65 +++++++++++++++++++++++++++++++++------------------
 arch/x86_64/mm/numa.c | 25 ++++----------------
 2 files changed, 47 insertions(+), 43 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e60a1a848de8..a1ad4cc423a7 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -318,32 +318,51 @@ void zap_low_mappings(void)
 	flush_tlb_all();
 }
 
+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+	   unsigned long start_pfn, unsigned long end_pfn)
+{
+ 	int i;
+ 	unsigned long w;
+
+ 	for (i = 0; i < MAX_NR_ZONES; i++)
+ 		z[i] = 0;
+
+ 	if (start_pfn < MAX_DMA_PFN)
+ 		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
+ 	if (start_pfn < MAX_DMA32_PFN) {
+ 		unsigned long dma32_pfn = MAX_DMA32_PFN;
+ 		if (dma32_pfn > end_pfn)
+ 			dma32_pfn = end_pfn;
+ 		z[ZONE_DMA32] = dma32_pfn - start_pfn;
+ 	}
+ 	z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+ 	/* Remove lower zones from higher ones. */
+ 	w = 0;
+ 	for (i = 0; i < MAX_NR_ZONES; i++) {
+ 		if (z[i])
+ 			z[i] -= w;
+ 	        w += z[i];
+	}
+
+	/* Compute holes */
+	w = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		unsigned long s = w;
+		w += z[i];
+		h[i] = e820_hole_size(s, w);
+	}
+}
+
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
-	{
-		unsigned long zones_size[MAX_NR_ZONES];
-		unsigned long holes[MAX_NR_ZONES];
-		unsigned int max_dma;
-
-		memset(zones_size, 0, sizeof(zones_size));
-		memset(holes, 0, sizeof(holes));
-
-		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-		if (end_pfn < max_dma) {
-			zones_size[ZONE_DMA] = end_pfn;
-			holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
-		} else {
-			zones_size[ZONE_DMA] = max_dma;
-			holes[ZONE_DMA] = e820_hole_size(0, max_dma);
-			zones_size[ZONE_NORMAL] = end_pfn - max_dma;
-			holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
-		}
-		free_area_init_node(0, NODE_DATA(0), zones_size,
-                        __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
-	}
-	return;
+	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+	size_zones(zones, holes, 0, end_pfn);
+	free_area_init_node(0, NODE_DATA(0), zones,
+			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
 }
 #endif
 
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 214803821001..18e86e2eac2d 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -132,29 +132,14 @@ void __init setup_node_zones(int nodeid)
 	unsigned long start_pfn, end_pfn; 
 	unsigned long zones[MAX_NR_ZONES];
 	unsigned long holes[MAX_NR_ZONES];
-	unsigned long dma_end_pfn;
 
-	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 
-	memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+ 	start_pfn = node_start_pfn(nodeid);
+ 	end_pfn = node_end_pfn(nodeid);
 
-	start_pfn = node_start_pfn(nodeid);
-	end_pfn = node_end_pfn(nodeid);
+	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
+		nodeid, start_pfn, end_pfn);
 
-	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
-	
-	/* All nodes > 0 have a zero length zone DMA */ 
-	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 
-	if (start_pfn < dma_end_pfn) { 
-		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
-		holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
-		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 
-		holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
-
-	} else { 
-		zones[ZONE_NORMAL] = end_pfn - start_pfn; 
-		holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
-	} 
-    
+	size_zones(zones, holes, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, holes);
 } 
-- 
cgit v1.2.2


From e18c6874a505958d153a11f9d6947971c349008a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Account mem_map in VM holes accounting

The VM needs to know about lost memory in zones to accurately
balance dirty pages. This patch accounts mem_map in there too,
which fixes a constant errror of a few percent. Also some
other misc mappings and the kernel text itself are accounted
too.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/init.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index a1ad4cc423a7..2b1d6c382396 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -47,6 +47,8 @@ extern int swiotlb;
 
 extern char _stext[];
 
+static unsigned long dma_reserve __initdata;
+
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
 /*
@@ -354,6 +356,21 @@ size_zones(unsigned long *z, unsigned long *h,
 		w += z[i];
 		h[i] = e820_hole_size(s, w);
 	}
+
+	/* Add the space pace needed for mem_map to the holes too. */
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
+
+	/* The 16MB DMA zone has the kernel and other misc mappings.
+ 	   Account them too */
+	if (h[ZONE_DMA]) {
+		h[ZONE_DMA] += dma_reserve;
+		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
+			printk(KERN_WARNING
+				"Kernel too large and filling up ZONE_DMA?\n");
+			h[ZONE_DMA] = z[ZONE_DMA];
+		}
+	}
 }
 
 #ifndef CONFIG_NUMA
@@ -510,6 +527,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 #else       		
 	reserve_bootmem(phys, len);    
 #endif
+	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
+		dma_reserve += len / PAGE_SIZE;
 }
 
 int kern_addr_valid(unsigned long addr) 
-- 
cgit v1.2.2


From 89b831ef8bf5cfbb357dbc0a2e07700d7f20eec5 Mon Sep 17 00:00:00 2001
From: Jacob Shin <jacob.shin@amd.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Support for AMD specific MCE Threshold.

MC4_MISC - DRAM Errors Threshold Register realized under AMD K8 Rev F.
This register is used to count correctable and uncorrectable ECC errors that occur during DRAM read operations.
The user may interface through sysfs files in order to change the threshold configuration.

bank%d/error_count - reads current error count, write to clear.
bank%d/interrupt_enable - set/clear interrupt enable.
bank%d/threshold_limit - read/write the threshold limit.

APIC vector 0xF9 in hw_irq.h.
5 software defined bank ids in mce.h.
new apic.c function to setup threshold apic lvt.
defaults to interrupt off, count enabled, and threshold limit max.
sysfs interface created on /sys/devices/system/threshold.

AK: added some ifdefs to make it compile on UP

Signed-off-by: Jacob Shin <jacob.shin@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig          |   8 +
 arch/x86_64/kernel/Makefile  |   1 +
 arch/x86_64/kernel/apic.c    |  10 +
 arch/x86_64/kernel/entry.S   |   3 +
 arch/x86_64/kernel/i8259.c   |   2 +
 arch/x86_64/kernel/mce.c     |   3 +
 arch/x86_64/kernel/mce_amd.c | 538 +++++++++++++++++++++++++++++++++++++++++++
 arch/x86_64/kernel/traps.c   |   4 +
 8 files changed, 569 insertions(+)
 create mode 100644 arch/x86_64/kernel/mce_amd.c

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 21afa69a086d..504dc52e8bfa 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -374,6 +374,14 @@ config X86_MCE_INTEL
 	   Additional support for intel specific MCE features such as
 	   the thermal monitor.
 
+config X86_MCE_AMD
+	bool "AMD MCE features"
+	depends on X86_MCE && X86_LOCAL_APIC
+	default y
+	help
+	   Additional support for AMD specific MCE features such as
+	   the DRAM Error Threshold.
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if EMBEDDED
 	default "0x100000"
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 14328cab5d3a..fe4cbd1c4b2f 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y	:= process.o signal.o entry.o traps.o irq.o \
 
 obj-$(CONFIG_X86_MCE)         += mce.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
+obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd.o
 obj-$(CONFIG_MTRR)		+= ../../i386/kernel/cpu/mtrr/
 obj-$(CONFIG_ACPI)		+= acpi/
 obj-$(CONFIG_X86_MSR)		+= msr.o
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index b6e7715d877f..18691ce4c759 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -833,6 +833,16 @@ int setup_profiling_timer(unsigned int multiplier)
 	return 0;
 }
 
+#ifdef CONFIG_X86_MCE_AMD
+void setup_threshold_lvt(unsigned long lvt_off)
+{
+	unsigned int v = 0;
+	unsigned long reg = (lvt_off << 4) + 0x500;
+	v |= THRESHOLD_APIC_VECTOR;
+	apic_write(reg, v);
+}
+#endif /* CONFIG_X86_MCE_AMD */
+
 #undef APIC_DIVISOR
 
 /*
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 7937971d1853..9ff42041bb6b 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -612,6 +612,9 @@ retint_kernel:
 ENTRY(thermal_interrupt)
 	apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
 
+ENTRY(threshold_interrupt)
+	apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
+
 #ifdef CONFIG_SMP	
 ENTRY(reschedule_interrupt)
 	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index c6c9791d77c1..5de30035e54b 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -492,6 +492,7 @@ void invalidate_interrupt5(void);
 void invalidate_interrupt6(void);
 void invalidate_interrupt7(void);
 void thermal_interrupt(void);
+void threshold_interrupt(void);
 void i8254_timer_resume(void);
 
 static void setup_timer_hardware(void)
@@ -580,6 +581,7 @@ void __init init_IRQ(void)
 	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 #endif	
 	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 69541db5ff2c..cf8a76f0f47e 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -356,6 +356,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_init(c);
 		break;
+	case X86_VENDOR_AMD:
+		mce_amd_feature_init(c);
+		break;
 	default:
 		break;
 	}
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
new file mode 100644
index 000000000000..1f76175ace02
--- /dev/null
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -0,0 +1,538 @@
+/*
+ *  (c) 2005 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ *
+ *  Written by Jacob Shin - AMD, Inc.
+ *
+ *  Support : jacob.shin@amd.com
+ *
+ *  MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F.
+ *  MC4_MISC0 exists per physical processor.
+ *
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kobject.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/sysdev.h>
+#include <linux/sysfs.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/percpu.h>
+
+#define PFX "mce_threshold: "
+#define VERSION "version 1.00.9"
+#define NR_BANKS 5
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_OVERFLOW 0x0001000000000000L
+
+struct threshold_bank {
+	unsigned int cpu;
+	u8 bank;
+	u8 interrupt_enable;
+	u16 threshold_limit;
+	struct kobject kobj;
+};
+
+static struct threshold_bank threshold_defaults = {
+	.interrupt_enable = 0,
+	.threshold_limit = THRESHOLD_MAX,
+};
+
+#ifdef CONFIG_SMP
+static unsigned char shared_bank[NR_BANKS] = {
+	0, 0, 0, 0, 1
+};
+#endif
+
+static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
+
+/*
+ * CPU Initialization
+ */
+
+/* must be called with correct cpu affinity */
+static void threshold_restart_bank(struct threshold_bank *b,
+				   int reset, u16 old_limit)
+{
+	u32 mci_misc_hi, mci_misc_lo;
+
+	rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+
+	if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+		reset = 1;	/* limit cannot be lower than err count */
+
+	if (reset) {		/* reset err count and overflow bit */
+		mci_misc_hi =
+		    (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+		    (THRESHOLD_MAX - b->threshold_limit);
+	} else if (old_limit) {	/* change limit w/o reset */
+		int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+		    (old_limit - b->threshold_limit);
+		mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+		    (new_count & THRESHOLD_MAX);
+	}
+
+	b->interrupt_enable ?
+	    (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+	    (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+
+	mci_misc_hi |= MASK_COUNT_EN_HI;
+	wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+}
+
+void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+	int bank;
+	u32 mci_misc_lo, mci_misc_hi;
+	unsigned int cpu = smp_processor_id();
+
+	for (bank = 0; bank < NR_BANKS; ++bank) {
+		rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi);
+
+		/* !valid, !counter present, bios locked */
+		if (!(mci_misc_hi & MASK_VALID_HI) ||
+		    !(mci_misc_hi & MASK_VALID_HI >> 1) ||
+		    (mci_misc_hi & MASK_VALID_HI >> 2))
+			continue;
+
+		per_cpu(bank_map, cpu) |= (1 << bank);
+
+#ifdef CONFIG_SMP
+		if (shared_bank[bank] && cpu_core_id[cpu])
+			continue;
+#endif
+
+		setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20);
+		threshold_defaults.cpu = cpu;
+		threshold_defaults.bank = bank;
+		threshold_restart_bank(&threshold_defaults, 0, 0);
+	}
+}
+
+/*
+ * APIC Interrupt Handler
+ */
+
+/*
+ * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
+ * the interrupt goes off when error_count reaches threshold_limit.
+ * the handler will simply log mcelog w/ software defined bank number.
+ */
+asmlinkage void mce_threshold_interrupt(void)
+{
+	int bank;
+	struct mce m;
+
+	ack_APIC_irq();
+	irq_enter();
+
+	memset(&m, 0, sizeof(m));
+	rdtscll(m.tsc);
+	m.cpu = smp_processor_id();
+
+	/* assume first bank caused it */
+	for (bank = 0; bank < NR_BANKS; ++bank) {
+		m.bank = MCE_THRESHOLD_BASE + bank;
+		rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc);
+
+		if (m.misc & MASK_OVERFLOW) {
+			mce_log(&m);
+			goto out;
+		}
+	}
+      out:
+	irq_exit();
+}
+
+/*
+ * Sysfs Interface
+ */
+
+static struct sysdev_class threshold_sysclass = {
+	set_kset_name("threshold"),
+};
+
+static DEFINE_PER_CPU(struct sys_device, device_threshold);
+
+struct threshold_attr {
+        struct attribute attr;
+        ssize_t(*show) (struct threshold_bank *, char *);
+        ssize_t(*store) (struct threshold_bank *, const char *, size_t count);
+};
+
+static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+
+static cpumask_t affinity_set(unsigned int cpu)
+{
+	cpumask_t oldmask = current->cpus_allowed;
+	cpumask_t newmask = CPU_MASK_NONE;
+	cpu_set(cpu, newmask);
+	set_cpus_allowed(current, newmask);
+	return oldmask;
+}
+
+static void affinity_restore(cpumask_t oldmask)
+{
+	set_cpus_allowed(current, oldmask);
+}
+
+#define SHOW_FIELDS(name) \
+        static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \
+        { \
+                return sprintf(buf, "%lx\n", (unsigned long) b->name); \
+        }
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+static ssize_t store_interrupt_enable(struct threshold_bank *b,
+				      const char *buf, size_t count)
+{
+	char *end;
+	cpumask_t oldmask;
+	unsigned long new = simple_strtoul(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+	b->interrupt_enable = !!new;
+
+	oldmask = affinity_set(b->cpu);
+	threshold_restart_bank(b, 0, 0);
+	affinity_restore(oldmask);
+
+	return end - buf;
+}
+
+static ssize_t store_threshold_limit(struct threshold_bank *b,
+				     const char *buf, size_t count)
+{
+	char *end;
+	cpumask_t oldmask;
+	u16 old;
+	unsigned long new = simple_strtoul(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+	if (new > THRESHOLD_MAX)
+		new = THRESHOLD_MAX;
+	if (new < 1)
+		new = 1;
+	old = b->threshold_limit;
+	b->threshold_limit = new;
+
+	oldmask = affinity_set(b->cpu);
+	threshold_restart_bank(b, 0, old);
+	affinity_restore(oldmask);
+
+	return end - buf;
+}
+
+static ssize_t show_error_count(struct threshold_bank *b, char *buf)
+{
+	u32 high, low;
+	cpumask_t oldmask;
+	oldmask = affinity_set(b->cpu);
+	rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */
+	affinity_restore(oldmask);
+	return sprintf(buf, "%x\n",
+		       (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+}
+
+static ssize_t store_error_count(struct threshold_bank *b,
+				 const char *buf, size_t count)
+{
+	cpumask_t oldmask;
+	oldmask = affinity_set(b->cpu);
+	threshold_restart_bank(b, 1, 0);
+	affinity_restore(oldmask);
+	return 1;
+}
+
+#define THRESHOLD_ATTR(_name,_mode,_show,_store) {            \
+        .attr = {.name = __stringify(_name), .mode = _mode }, \
+        .show = _show,                                        \
+        .store = _store,                                      \
+};
+
+#define ATTR_FIELDS(name) \
+        static struct threshold_attr name = \
+        THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
+
+ATTR_FIELDS(interrupt_enable);
+ATTR_FIELDS(threshold_limit);
+ATTR_FIELDS(error_count);
+
+static struct attribute *default_attrs[] = {
+	&interrupt_enable.attr,
+	&threshold_limit.attr,
+	&error_count.attr,
+	NULL
+};
+
+#define to_bank(k) container_of(k,struct threshold_bank,kobj)
+#define to_attr(a) container_of(a,struct threshold_attr,attr)
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct threshold_bank *b = to_bank(kobj);
+	struct threshold_attr *a = to_attr(attr);
+	ssize_t ret;
+	ret = a->show ? a->show(b, buf) : -EIO;
+	return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+		     const char *buf, size_t count)
+{
+	struct threshold_bank *b = to_bank(kobj);
+	struct threshold_attr *a = to_attr(attr);
+	ssize_t ret;
+	ret = a->store ? a->store(b, buf, count) : -EIO;
+	return ret;
+}
+
+static struct sysfs_ops threshold_ops = {
+	.show = show,
+	.store = store,
+};
+
+static struct kobj_type threshold_ktype = {
+	.sysfs_ops = &threshold_ops,
+	.default_attrs = default_attrs,
+};
+
+/* symlinks sibling shared banks to first core.  first core owns dir/files. */
+static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
+{
+	int err = 0;
+	struct threshold_bank *b = 0;
+
+#ifdef CONFIG_SMP
+	if (cpu_core_id[cpu] && shared_bank[bank]) {	/* symlink */
+		char name[16];
+		unsigned lcpu = first_cpu(cpu_core_map[cpu]);
+		if (cpu_core_id[lcpu])
+			goto out;	/* first core not up yet */
+
+		b = per_cpu(threshold_banks, lcpu)[bank];
+		if (!b)
+			goto out;
+		sprintf(name, "bank%i", bank);
+		err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj,
+					&b->kobj, name);
+		if (err)
+			goto out;
+		per_cpu(threshold_banks, cpu)[bank] = b;
+		goto out;
+	}
+#endif
+
+	b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+	if (!b) {
+		err = -ENOMEM;
+		goto out;
+	}
+	memset(b, 0, sizeof(struct threshold_bank));
+
+	b->cpu = cpu;
+	b->bank = bank;
+	b->interrupt_enable = 0;
+	b->threshold_limit = THRESHOLD_MAX;
+	kobject_set_name(&b->kobj, "bank%i", bank);
+	b->kobj.parent = &per_cpu(device_threshold, cpu).kobj;
+	b->kobj.ktype = &threshold_ktype;
+
+	err = kobject_register(&b->kobj);
+	if (err) {
+		kfree(b);
+		goto out;
+	}
+	per_cpu(threshold_banks, cpu)[bank] = b;
+      out:
+	return err;
+}
+
+/* create dir/files for all valid threshold banks */
+static __cpuinit int threshold_create_device(unsigned int cpu)
+{
+	int bank;
+	int err = 0;
+
+	per_cpu(device_threshold, cpu).id = cpu;
+	per_cpu(device_threshold, cpu).cls = &threshold_sysclass;
+	err = sysdev_register(&per_cpu(device_threshold, cpu));
+	if (err)
+		goto out;
+
+	for (bank = 0; bank < NR_BANKS; ++bank) {
+		if (!(per_cpu(bank_map, cpu) & 1 << bank))
+			continue;
+		err = threshold_create_bank(cpu, bank);
+		if (err)
+			goto out;
+	}
+      out:
+	return err;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * let's be hotplug friendly.
+ * in case of multiple core processors, the first core always takes ownership
+ *   of shared sysfs dir/files, and rest of the cores will be symlinked to it.
+ */
+
+/* cpu hotplug call removes all symlinks before first core dies */
+static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
+{
+	struct threshold_bank *b;
+	char name[16];
+
+	b = per_cpu(threshold_banks, cpu)[bank];
+	if (!b)
+		return;
+	if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) {
+		sprintf(name, "bank%i", bank);
+		sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name);
+		per_cpu(threshold_banks, cpu)[bank] = 0;
+	} else {
+		kobject_unregister(&b->kobj);
+		kfree(per_cpu(threshold_banks, cpu)[bank]);
+	}
+}
+
+static __cpuinit void threshold_remove_device(unsigned int cpu)
+{
+	int bank;
+
+	for (bank = 0; bank < NR_BANKS; ++bank) {
+		if (!(per_cpu(bank_map, cpu) & 1 << bank))
+			continue;
+		threshold_remove_bank(cpu, bank);
+	}
+	sysdev_unregister(&per_cpu(device_threshold, cpu));
+}
+
+/* link all existing siblings when first core comes up */
+static __cpuinit int threshold_create_symlinks(unsigned int cpu)
+{
+	int bank, err = 0;
+	unsigned int lcpu = 0;
+
+	if (cpu_core_id[cpu])
+		return 0;
+	for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
+		if (lcpu == cpu)
+			continue;
+		for (bank = 0; bank < NR_BANKS; ++bank) {
+			if (!(per_cpu(bank_map, cpu) & 1 << bank))
+				continue;
+			if (!shared_bank[bank])
+				continue;
+			err = threshold_create_bank(lcpu, bank);
+		}
+	}
+	return err;
+}
+
+/* remove all symlinks before first core dies. */
+static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
+{
+	int bank;
+	unsigned int lcpu = 0;
+	if (cpu_core_id[cpu])
+		return;
+	for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
+		if (lcpu == cpu)
+			continue;
+		for (bank = 0; bank < NR_BANKS; ++bank) {
+			if (!(per_cpu(bank_map, cpu) & 1 << bank))
+				continue;
+			if (!shared_bank[bank])
+				continue;
+			threshold_remove_bank(lcpu, bank);
+		}
+	}
+}
+#else /* !CONFIG_HOTPLUG_CPU */
+static __cpuinit void threshold_create_symlinks(unsigned int cpu)
+{
+}
+static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
+{
+}
+static void threshold_remove_device(unsigned int cpu)
+{
+}
+#endif
+
+/* get notified when a cpu comes on/off */
+static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb,
+					    unsigned long action, void *hcpu)
+{
+	/* cpu was unsigned int to begin with */
+	unsigned int cpu = (unsigned long)hcpu;
+
+	if (cpu >= NR_CPUS)
+		goto out;
+
+	switch (action) {
+	case CPU_ONLINE:
+		threshold_create_device(cpu);
+		threshold_create_symlinks(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		threshold_remove_symlinks(cpu);
+		break;
+	case CPU_DOWN_FAILED:
+		threshold_create_symlinks(cpu);
+		break;
+	case CPU_DEAD:
+		threshold_remove_device(cpu);
+		break;
+	default:
+		break;
+	}
+      out:
+	return NOTIFY_OK;
+}
+
+static struct notifier_block threshold_cpu_notifier = {
+	.notifier_call = threshold_cpu_callback,
+};
+
+static __init int threshold_init_device(void)
+{
+	int err;
+	int lcpu = 0;
+
+	err = sysdev_class_register(&threshold_sysclass);
+	if (err)
+		goto out;
+
+	/* to hit CPUs online before the notifier is up */
+	for_each_online_cpu(lcpu) {
+		err = threshold_create_device(lcpu);
+		if (err)
+			goto out;
+	}
+	register_cpu_notifier(&threshold_cpu_notifier);
+
+      out:
+	return err;
+}
+
+device_initcall(threshold_init_device);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index b5e09e6b5536..4a836384dd0f 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -888,6 +888,10 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
 {
 }
 
+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+{
+}
+
 /*
  *  'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
-- 
cgit v1.2.2


From 6004e1b7effcbb385a6b7c790e4b8008682cf679 Mon Sep 17 00:00:00 2001
From: James Cleverdon <jamesclv@us.ibm.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] i386/x86-64: Share interrupt vectors when there is a large
 number of interrupt sources

Here's a patch that builds on Natalie Protasevich's IRQ compression
patch and tries to work for MPS boots as well as ACPI.  It is meant for
a 4-node IBM x460 NUMA box, which was dying because it had interrupt
pins with GSI numbers > NR_IRQS and thus overflowed irq_desc.

The problem is that this system has 270 GSIs (which are 1:1 mapped with
I/O APIC RTEs) and an 8-node box would have 540.  This is much bigger
than NR_IRQS (224 for both i386 and x86_64).  Also, there aren't enough
vectors to go around.  There are about 190 usable vectors, not counting
the reserved ones and the unused vectors at 0x20 to 0x2F.  So, my patch
attempts to compress the GSI range and share vectors by sharing IRQs.

Cc: "Protasevich, Natalie" <Natalie.Protasevich@unisys.com>

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/io_apic.c | 80 ++++++++++++++++++++++++++++++++++++++++----
 arch/x86_64/kernel/mpparse.c |  2 +-
 2 files changed, 74 insertions(+), 8 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c8eee20cd519..97154ab058b4 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -57,7 +57,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -85,6 +85,7 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
 	int pin;							\
 	struct irq_pin_list *entry = irq_2_pin + irq;			\
 									\
+	BUG_ON(irq >= NR_IRQS);						\
 	for (;;) {							\
 		unsigned int reg;					\
 		pin = entry->pin;					\
@@ -127,6 +128,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 }
 #endif
 
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -137,6 +140,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	static int first_free_entry = NR_IRQS;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 
+	BUG_ON(irq >= NR_IRQS);
 	while (entry->next)
 		entry = irq_2_pin + entry->next;
 
@@ -144,7 +148,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 		entry->next = first_free_entry;
 		entry = irq_2_pin + entry->next;
 		if (++first_free_entry >= PIN_MAP_SIZE)
-			panic("io_apic.c: whoops");
+			panic("io_apic.c: ran out of irq_2_pin entries!");
 	}
 	entry->apic = apic;
 	entry->pin = pin;
@@ -420,6 +424,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 				best_guess = irq;
 		}
 	}
+	BUG_ON(best_guess >= NR_IRQS);
 	return best_guess;
 }
 
@@ -610,6 +615,64 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+static int next_irq = 16;
+
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+
+	BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+	if (platform_legacy_irq(gsi))
+		return gsi;
+
+	if (gsi_2_irq[gsi] != 0xFF)
+		return (int)gsi_2_irq[gsi];
+
+	tries = NR_IRQS;
+  try_again:
+	vector = assign_irq_vector(gsi);
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
+	 * use of vector and if found, return that IRQ.  However, we never want
+	 * to share legacy IRQs, which usually have a different trigger mode
+	 * than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+	if (platform_legacy_irq(i)) {
+		if (--tries >= 0) {
+			IO_APIC_VECTOR(i) = 0;
+			goto try_again;
+		}
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	if (i < NR_IRQS) {
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	i = next_irq++;
+	BUG_ON(i >= NR_IRQS);
+	gsi_2_irq[gsi] = i;
+	IO_APIC_VECTOR(i) = vector;
+	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, i);
+	return i;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
@@ -639,6 +702,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 			break;
 		}
 		default:
@@ -648,6 +712,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 			break;
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 
 	/*
 	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -663,6 +728,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 			}
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 	return irq;
 }
 
@@ -690,8 +756,8 @@ int assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
-	if (IO_APIC_VECTOR(irq) > 0)
+	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
 next:
 	current_vector += 8;
@@ -699,9 +765,8 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
@@ -1917,6 +1982,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	entry.polarity = active_high_low;
 	entry.mask = 1;					 /* Disabled (masked) */
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index f16d38d09daf..8f6958e79455 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -218,7 +218,7 @@ static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+	if (++mp_irq_entries >= MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
-- 
cgit v1.2.2


From 47492d3667ec519172ab978bd8231b8c7152fa9d Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Use the DMA32 zone for
 dma_alloc_coherent()/pci_alloc_consistent

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/pci-gart.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 88be97c96987..2e28e855ec3c 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -220,6 +220,12 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	   uses the normal dma_mask for alloc_coherent. */
 	dma_mask &= *dev->dma_mask;
 
+	/* Why <=? Even when the mask is smaller than 4GB it is often larger 
+	   than 16MB and in this case we have a chance of finding fitting memory 
+	   in the next higher zone first. If not retry with true GFP_DMA. -AK */
+	if (dma_mask <= 0xffffffff)
+		gfp |= GFP_DMA32;
+
  again:
 	memory = dma_alloc_pages(dev, gfp, get_order(size));
 	if (memory == NULL)
@@ -245,7 +251,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 				}
 
 				if (!(gfp & GFP_DMA)) { 
-					gfp |= GFP_DMA; 
+					gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
 					goto again;
 				}
 				return NULL;
-- 
cgit v1.2.2


From e9b59d834faf0305cb6214a358f65f72c57b9e99 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Remove duplicate __cpuinit define

Remove duplicate __cpuinit in smp.c. Already defined in init.h which is
already included.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 9db9dda161b4..493a0d1f1279 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -28,8 +28,6 @@
 #include <asm/proto.h>
 #include <asm/apicdef.h>
 
-#define __cpuinit __init
-
 /*
  *	Smarter SMP flushing macros. 
  *		c/o Linus Torvalds.
-- 
cgit v1.2.2


From f5f786d0455c359c554b8f74783f887c0a2c9fac Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86-64/i386: Fix CPU model for family 6

According to cpuid instruction in IA32 SDM-Vol2, when computing cpu model,
we need to consider extended model ID for family 0x6 also.

AK: Also added fixes/simplifcation from Petr Vandrovec

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index da0bc3e7bdf5..f27731ac95c5 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -1060,10 +1060,10 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 		c->x86 = (tfms >> 8) & 0xf;
 		c->x86_model = (tfms >> 4) & 0xf;
 		c->x86_mask = tfms & 0xf;
-		if (c->x86 == 0xf) {
+		if (c->x86 == 0xf)
 			c->x86 += (tfms >> 20) & 0xff;
+		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xF) << 4;
-		} 
 		if (c->x86_capability[0] & (1<<19)) 
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
 	} else {
-- 
cgit v1.2.2


From 50895c5d76e15d8af480eff1aaab5770cabbc2c2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Fix gcc 4 warning in aperture.c

Fix

  arch/x86_64/kernel/aperture.c: In function #iommu_hole_init#:
  arch/x86_64/kernel/aperture.c:199: warning: #aper_order# may be used uninitialized in this function

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/aperture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 962ad4823b6a..c7f4fdd20f05 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -196,7 +196,7 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
 void __init iommu_hole_init(void) 
 { 
 	int fix, num; 
-	u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
+	u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
 	u64 aper_base, last_aper_base = 0;
 	int valid_agp = 0;
 
-- 
cgit v1.2.2


From 69d81fcde7797342417591ba7affb372b9c86eae Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Speed up numa_node_id by putting it directly into the
 PDA

Not go from the CPU number to an mapping array.
Mode number is often used now in fast paths.

This also adds a generic numa_node_id to all the topology includes

Suggested by Eric Dumazet

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c |  4 ++--
 arch/x86_64/mm/numa.c      | 10 ++++++++--
 arch/x86_64/mm/srat.c      |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index f27731ac95c5..99cfa751949a 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -823,7 +823,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
  		if (!node_online(node))
  			node = nearby_node(apicid);
  	}
-  	cpu_to_node[cpu] = node;
+	numa_set_node(cpu, node);
 
   	printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
   			cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
@@ -975,7 +975,7 @@ static void srat_detect_node(void)
 	node = apicid_to_node[hard_smp_processor_id()];
 	if (node == NUMA_NO_NODE)
 		node = 0;
-	cpu_to_node[cpu] = node;
+	numa_set_node(cpu, node);
 
 	if (acpi_numa > 0)
 		printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 18e86e2eac2d..4bf64583ba3b 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -156,7 +156,7 @@ void __init numa_init_array(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
-		cpu_to_node[i] = rr;
+ 		numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
@@ -242,7 +242,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	nodes_clear(node_online_map);
 	node_set_online(0);
 	for (i = 0; i < NR_CPUS; i++)
-		cpu_to_node[i] = 0;
+		numa_set_node(i, 0);
 	node_to_cpumask[0] = cpumask_of_cpu(0);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
@@ -252,6 +252,12 @@ __cpuinit void numa_add_cpu(int cpu)
 	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 } 
 
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	cpu_pda[cpu].nodenumber = node;
+	cpu_to_node[cpu] = node;
+}
+
 unsigned long __init numa_free_all_bootmem(void) 
 { 
 	int i;
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 4b2e844c15a7..c7aa08a58041 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -203,7 +203,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		if (cpu_to_node[i] == NUMA_NO_NODE)
 			continue;
 		if (!node_isset(cpu_to_node[i], nodes_parsed))
-			cpu_to_node[i] = NUMA_NO_NODE; 
+			numa_set_node(i, NUMA_NO_NODE);
 	}
 	numa_init_array();
 	return 0;
-- 
cgit v1.2.2


From f6c2e3330d3fdd5474bc3756da46fca889a30e33 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Unmap NULL during early bootup

We should zap the low mappings, as soon as possible, so that we can catch
kernel bugs more effectively. Previously early boot had NULL mapped
and didn't trap on NULL references.

This patch introduces boot_level4_pgt, which will always have low identity
addresses mapped.  Druing boot, all the processors will use this as their
level4 pgt.  On BP, we will switch to init_level4_pgt as soon as we enter C
code and zap the low mappings as soon as we are done with the usage of
identity low mapped addresses.  On AP's we will zap the low mappings as
soon as we jump to C code.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/head.S    | 37 +++++++++++++++++++++++--------------
 arch/x86_64/kernel/head64.c  |  8 ++++++++
 arch/x86_64/kernel/mpparse.c |  2 +-
 arch/x86_64/kernel/setup.c   |  2 ++
 arch/x86_64/kernel/setup64.c |  2 +-
 arch/x86_64/kernel/smpboot.c |  3 ---
 arch/x86_64/mm/init.c        | 28 +++++++++++++++++-----------
 7 files changed, 52 insertions(+), 30 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index b92e5f45ed46..15290968e49d 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -12,6 +12,7 @@
 
 #include <linux/linkage.h>
 #include <linux/threads.h>
+#include <linux/init.h>
 #include <asm/desc.h>
 #include <asm/segment.h>
 #include <asm/page.h>
@@ -70,7 +71,7 @@ startup_32:
 	movl	%eax, %cr4
 
 	/* Setup early boot stage 4 level pagetables */
-	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
+	movl	$(boot_level4_pgt - __START_KERNEL_map), %eax
 	movl	%eax, %cr3
 
 	/* Setup EFER (Extended Feature Enable Register) */
@@ -113,7 +114,7 @@ startup_64:
 	movq	%rax, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	movq	$(boot_level4_pgt - __START_KERNEL_map), %rax
 	movq	%rax, %cr3
 
 	/* Check if nx is implemented */
@@ -240,20 +241,10 @@ ljumpvector:
 ENTRY(stext)
 ENTRY(_stext)
 
-	/*
-	 * This default setting generates an ident mapping at address 0x100000
-	 * and a mapping for the kernel that precisely maps virtual address
-	 * 0xffffffff80000000 to physical address 0x000000. (always using
-	 * 2Mbyte large pages provided by PAE mode)
-	 */
 .org 0x1000
 ENTRY(init_level4_pgt)
-	.quad	0x0000000000002007 + __PHYSICAL_START	/* -> level3_ident_pgt */
-	.fill	255,8,0
-	.quad	0x000000000000a007 + __PHYSICAL_START
-	.fill	254,8,0
-	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad	0x0000000000003007 + __PHYSICAL_START	/* -> level3_kernel_pgt */
+	/* This gets initialized in x86_64_start_kernel */
+	.fill	512,8,0
 
 .org 0x2000
 ENTRY(level3_ident_pgt)
@@ -350,6 +341,24 @@ ENTRY(wakeup_level4_pgt)
 	.quad	0x0000000000003007 + __PHYSICAL_START	/* -> level3_kernel_pgt */
 #endif
 
+#ifndef CONFIG_HOTPLUG_CPU
+	__INITDATA
+#endif
+	/*
+	 * This default setting generates an ident mapping at address 0x100000
+	 * and a mapping for the kernel that precisely maps virtual address
+	 * 0xffffffff80000000 to physical address 0x000000. (always using
+	 * 2Mbyte large pages provided by PAE mode)
+	 */
+	.align PAGE_SIZE
+ENTRY(boot_level4_pgt)
+	.quad	0x0000000000002007 + __PHYSICAL_START	/* -> level3_ident_pgt */
+	.fill	255,8,0
+	.quad	0x000000000000a007 + __PHYSICAL_START
+	.fill	254,8,0
+	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+	.quad	0x0000000000003007 + __PHYSICAL_START	/* -> level3_kernel_pgt */
+
 	.data
 
 	.align 16
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index cf6ab147a2a5..e0f12d49db4c 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -19,6 +19,7 @@
 #include <asm/bootsetup.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
+#include <asm/pgtable.h>
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
    yet. */
@@ -86,6 +87,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
 		set_intr_gate(i, early_idt_handler);
 	asm volatile("lidt %0" :: "m" (idt_descr));
 	clear_bss();
+
+	/*
+	 * switch to init_level4_pgt from boot_level4_pgt
+	 */
+	memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
+	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+
 	pda_init(0);
 	copy_bootdata(real_mode_data);
 #ifdef CONFIG_SMP
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 8f6958e79455..4a581d1cefbd 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -549,7 +549,7 @@ void __init get_smp_config (void)
 		 * Read the physical hardware table.  Anything here will
 		 * override the defaults.
 		 */
-		if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) {
+		if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
 			smp_found_config = 0;
 			printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
 			printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 99cfa751949a..f76f3743f5d7 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -571,6 +571,8 @@ void __init setup_arch(char **cmdline_p)
 
 	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
 
+	zap_low_mappings(0);
+
 #ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 79190891fbc5..06dc354375c3 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -141,7 +141,6 @@ void pda_init(int cpu)
 			panic("cannot allocate irqstack for cpu %d", cpu); 
 	}
 
-	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
 
 	pda->irqstackptr += IRQSTACKSIZE-64;
 } 
@@ -197,6 +196,7 @@ void __cpuinit cpu_init (void)
 	/* CPU 0 is initialised in head64.c */
 	if (cpu != 0) {
 		pda_init(cpu);
+		zap_low_mappings(cpu);
 	} else 
 		estacks = boot_exception_stacks; 
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 658a81b33f3b..3393fc08823b 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -1063,9 +1063,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
  */
 void __init smp_cpus_done(unsigned int max_cpus)
 {
-#ifndef CONFIG_HOTPLUG_CPU
-	zap_low_mappings();
-#endif
 	smp_cleanup_boot();
 
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 2b1d6c382396..be483a1d7b54 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -312,12 +312,19 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
 
 extern struct x8664_pda cpu_pda[NR_CPUS];
 
-/* Assumes all CPUs still execute in init_mm */
-void zap_low_mappings(void)
+void __cpuinit zap_low_mappings(int cpu)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	flush_tlb_all();
+	if (cpu == 0) {
+		pgd_t *pgd = pgd_offset_k(0UL);
+		pgd_clear(pgd);
+	} else {
+		/*
+		 * For AP's, zap the low identity mappings by changing the cr3
+		 * to init_level4_pgt and doing local flush tlb all
+		 */
+		asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+	}
+	__flush_tlb_all();
 }
 
 /* Compute zone sizes for the DMA and DMA32 zones in a node. */
@@ -474,14 +481,13 @@ void __init mem_init(void)
 		datasize >> 10,
 		initsize >> 10);
 
+#ifdef CONFIG_SMP
 	/*
-	 * Subtle. SMP is doing its boot stuff late (because it has to
-	 * fork idle threads) - but it also needs low mappings for the
-	 * protected-mode entry to work. We zap these entries only after
-	 * the WP-bit has been tested.
+	 * Sync boot_level4_pgt mappings with the init_level4_pgt
+	 * except for the low identity mappings which are already zapped
+	 * in init_level4_pgt. This sync-up is essential for AP's bringup
 	 */
-#ifndef CONFIG_SMP
-	zap_low_mappings();
+	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
 #endif
 }
 
-- 
cgit v1.2.2


From 2bc0414ee04fd8bb798760801f5d7476dff44241 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Only use asm/sections.h to declare section symbols

Adding __initdata_* to asm-generic/sections.h
Replaces a lot of open coded externs in arch/x86_64/*
I had to change __bss_end to __bss_stop to match the other architectures.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/e820.c        |  3 +--
 arch/x86_64/kernel/head64.c      |  6 ++----
 arch/x86_64/kernel/setup.c       |  2 +-
 arch/x86_64/kernel/vmlinux.lds.S |  2 +-
 arch/x86_64/mm/init.c            | 10 ++--------
 5 files changed, 7 insertions(+), 16 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index ab3f87aaff70..17579a1a174b 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -23,8 +23,7 @@
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/bootsetup.h>
-
-extern char _end[];
+#include <asm/sections.h>
 
 /* 
  * PFN of last memory page.
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index e0f12d49db4c..b675c5add01e 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -20,14 +20,14 @@
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
+#include <asm/sections.h>
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
    yet. */
 static void __init clear_bss(void)
 {
-	extern char __bss_start[], __bss_end[];
 	memset(__bss_start, 0,
-	       (unsigned long) __bss_end - (unsigned long) __bss_start);
+	       (unsigned long) __bss_stop - (unsigned long) __bss_start);
 }
 
 #define NEW_CL_POINTER		0x228	/* Relative to real mode data */
@@ -76,8 +76,6 @@ static void __init setup_boot_cpu_data(void)
 	boot_cpu_data.x86_mask = eax & 0xf;
 }
 
-extern char _end[];
-
 void __init x86_64_start_kernel(char * real_mode_data)
 {
 	char *s;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index f76f3743f5d7..2ad3556dda52 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -60,6 +60,7 @@
 #include <asm/setup.h>
 #include <asm/mach_apic.h>
 #include <asm/numa.h>
+#include <asm/sections.h>
 
 /*
  * Machine setup..
@@ -103,7 +104,6 @@ struct edid_info edid_info;
 struct e820map e820;
 
 extern int root_mountflags;
-extern char _text, _etext, _edata, _end;
 
 char command_line[COMMAND_LINE_SIZE];
 
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 6dd642cad2ef..58b19215b4b3 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -50,7 +50,7 @@ SECTIONS
 	*(.bss.page_aligned)	
 	*(.bss)
 	}
-  __bss_end = .;
+  __bss_stop = .;
 
   . = ALIGN(PAGE_SIZE);
   . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index be483a1d7b54..489e18df1fe9 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -36,6 +36,7 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/smp.h>
+#include <asm/sections.h>
 
 #ifndef Dprintk
 #define Dprintk(x...)
@@ -45,8 +46,6 @@
 extern int swiotlb;
 #endif
 
-extern char _stext[];
-
 static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -88,9 +87,6 @@ void show_mem(void)
 
 /* References to section boundaries */
 
-extern char _text, _etext, _edata, __bss_start, _end[];
-extern char __init_begin, __init_end;
-
 int after_bootmem;
 
 static void *spp_getpage(void)
@@ -491,8 +487,6 @@ void __init mem_init(void)
 #endif
 }
 
-extern char __initdata_begin[], __initdata_end[];
-
 void free_initmem(void)
 {
 	unsigned long addr;
@@ -506,7 +500,7 @@ void free_initmem(void)
 		totalram_pages++;
 	}
 	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
-	printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
+	printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
-- 
cgit v1.2.2


From 4d74dbd79a571b31f7cd2c69bb5e44368972bf93 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Replace cpu_pda extern with include

Minor cleanup - remove obsolete extern

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/init.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 489e18df1fe9..63f192cd7c1c 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -306,8 +306,6 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
 	       table_end<<PAGE_SHIFT);
 }
 
-extern struct x8664_pda cpu_pda[NR_CPUS];
-
 void __cpuinit zap_low_mappings(int cpu)
 {
 	if (cpu == 0) {
-- 
cgit v1.2.2


From 59170891049cc469777a392e0b3f5aa7aad784a6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Replace swiotlb extern with include

Minor victory on the continuous quest against all stray extern.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/init.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 63f192cd7c1c..854a41b8372b 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -22,6 +22,7 @@
 #include <linux/pagemap.h>
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
+#include <linux/pci.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -42,10 +43,6 @@
 #define Dprintk(x...)
 #endif
 
-#ifdef CONFIG_GART_IOMMU
-extern int swiotlb;
-#endif
-
 static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-- 
cgit v1.2.2


From 420f8f68c9c5148dddf946bebdbc7eacde2172cb Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: New heuristics to find out hotpluggable CPUs.

With a NR_CPUS==128 kernel with CPU hotplug enabled we would waste 4MB
on per CPU data of all possible CPUs.  The reason was that HOTPLUG
always set up possible map to NR_CPUS cpus and then we need to allocate
that much (each per CPU data is roughly ~32k now)

The underlying problem is that ACPI didn't tell us how many hotplug CPUs
the platform supports.  So the old code just assumed all, which would
lead to this memory wastage.

This implements some new heuristics:

 - If the BIOS specified disabled CPUs in the ACPI/mptables assume they
   can be enabled later (this is bending the ACPI specification a bit,
   but seems like a obvious extension)
 - The user can overwrite it with a new additionals_cpus=NUM option
 - Otherwise use half of the available CPUs or 2, whatever is more.

Cc: ashok.raj@intel.com
Cc: len.brown@intel.com

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/mpparse.c |  8 ++++++--
 arch/x86_64/kernel/smpboot.c | 39 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 42 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 4a581d1cefbd..1d61f10a92c6 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -65,7 +65,9 @@ unsigned long mp_lapic_addr = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_id = -1U;
 /* Internal processor count */
-static unsigned int num_processors = 0;
+unsigned int num_processors __initdata = 0;
+
+unsigned disabled_cpus __initdata;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
@@ -109,8 +111,10 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 	int ver, cpu;
 	static int found_bsp=0;
 
-	if (!(m->mpc_cpuflag & CPU_ENABLED))
+	if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+		disabled_cpus++;
 		return;
+	}
 
 	printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
 		m->mpc_apicid,
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 3393fc08823b..f74319a80659 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -880,6 +880,9 @@ static __init void disable_smp(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+int additional_cpus __initdata = -1;
+
 /*
  * cpu_possible_map should be static, it cannot change as cpu's
  * are onlined, or offlined. The reason is per-cpu data-structures
@@ -888,14 +891,38 @@ static __init void disable_smp(void)
  * cpu_present_map on the other hand can change dynamically.
  * In case when cpu_hotplug is not compiled, then we resort to current
  * behaviour, which is cpu_possible == cpu_present.
- * If cpu-hotplug is supported, then we need to preallocate for all
- * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
  * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - otherwise use half of the available CPUs or 2, whatever is more.
+ * - The user can overwrite it with additional_cpus=NUM
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
  */
 __init void prefill_possible_map(void)
 {
 	int i;
-	for (i = 0; i < NR_CPUS; i++)
+	int possible;
+
+ 	if (additional_cpus == -1) {
+ 		if (disabled_cpus > 0) {
+ 			additional_cpus = disabled_cpus;
+ 		} else {
+ 			additional_cpus = num_processors / 2;
+ 			if (additional_cpus == 0)
+ 				additional_cpus = 2;
+ 		}
+ 	}
+	possible = num_processors + additional_cpus;
+	if (possible > NR_CPUS) 
+		possible = NR_CPUS;
+
+	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+		possible,
+	        max_t(int, possible - num_processors, 0));
+
+	for (i = 0; i < possible; i++)
 		cpu_set(i, cpu_possible_map);
 }
 #endif
@@ -1151,6 +1178,12 @@ void __cpu_die(unsigned int cpu)
  	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
 
+static __init int setup_additional_cpus(char *s)
+{
+	return get_option(&s, &additional_cpus);
+}
+__setup("additional_cpus=", setup_additional_cpus);
+
 #else /* ... !CONFIG_HOTPLUG_CPU */
 
 int __cpu_disable(void)
-- 
cgit v1.2.2


From e4e5d324b9c5586f408a72d1534474c449dd5212 Mon Sep 17 00:00:00 2001
From: Bryan Ford <baford@mit.edu>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Save/restore CS in 64bit signal handlers and force
 __USER_CS for CS

This allows to run 64bit signal handlers in 64bit processes that run small
code snippets in compat mode.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/signal.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index d642fbf3da29..5876df116c92 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -110,6 +110,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
 	COPY(r14);
 	COPY(r15);
 
+	/* Kernel saves and restores only the CS segment register on signals,
+	 * which is the bare minimum needed to allow mixed 32/64-bit code.
+	 * App's signal handler can save/restore other segments if needed. */
+	{
+		unsigned cs;
+		err |= __get_user(cs, &sc->cs);
+		regs->cs = cs | 3;	/* Force into user mode */
+	}
+
 	{
 		unsigned int tmpflags;
 		err |= __get_user(tmpflags, &sc->eflags);
@@ -187,6 +196,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
 {
 	int err = 0;
 
+	err |= __put_user(regs->cs, &sc->cs);
 	err |= __put_user(0, &sc->gs);
 	err |= __put_user(0, &sc->fs);
 
@@ -318,7 +328,14 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 
 	regs->rsp = (unsigned long)frame;
 
+	/* Set up the CS register to run signal handlers in 64-bit mode,
+	   even if the handler happens to be interrupting 32-bit code. */
+	regs->cs = __USER_CS;
+
+	/* This, by contrast, has nothing to do with segment registers -
+	   see include/asm-x86_64/uaccess.h for details. */
 	set_fs(USER_DS);
+
 	regs->eflags &= ~TF_MASK;
 	if (test_thread_flag(TIF_SINGLESTEP))
 		ptrace_notify(SIGTRAP);
-- 
cgit v1.2.2


From 529a340402e419f935d411ce0a085e96fcaf3872 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Optimize NUMA node hash function

Compute the highest possible value for memnode_shift, in order to reduce
footprint of memnodemap[] to the minimum, thus making all users
(phys_to_nid(), kfree()), more cache friendly.

Before the patch :

 Node 0 MemBase 0000000000000000 Limit 00000001ffffffff
 Node 1 MemBase 0000000200000000 Limit 00000003ffffffff
 Using 23 for the hash shift. Max adder is 3ffffffff

After the patch :

 Node 0 MemBase 0000000000000000 Limit 00000001ffffffff
 Node 1 MemBase 0000000200000000 Limit 00000003ffffffff
 Using 33 for the hash shift.

In this case, only 2 bytes of memnodemap[] are used, instead of 2048

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/numa.c | 67 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 24 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 4bf64583ba3b..edd5559380d3 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -38,38 +38,57 @@ cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
 
-int __init compute_hash_shift(struct node *nodes, int numnodes)
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns :
+ * 1 if OK
+ * 0 if memnodmap[] too small (of shift too small)
+ * -1 if node overlap or lost ram (shift too big)
+ */
+static int __init populate_memnodemap(
+	const struct node *nodes, int numnodes, int shift)
 {
 	int i; 
-	int shift = 20;
-	unsigned long addr,maxend=0;
-	
-	for (i = 0; i < numnodes; i++)
-		if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
-				maxend = nodes[i].end;
+	int res = -1;
+	unsigned long addr, end;
 
-	while ((1UL << shift) <  (maxend / NODEMAPSIZE))
-		shift++;
-
-	printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
-			shift,maxend);
-	memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
+	memset(memnodemap, 0xff, sizeof(memnodemap));
 	for (i = 0; i < numnodes; i++) {
-		if (nodes[i].start == nodes[i].end)
+		addr = nodes[i].start;
+		end = nodes[i].end;
+		if (addr >= end)
 			continue;
-		for (addr = nodes[i].start;
-		     addr < nodes[i].end;
-		     addr += (1UL << shift)) {
-			if (memnodemap[addr >> shift] != 0xff) {
-				printk(KERN_INFO
-	"Your memory is not aligned you need to rebuild your kernel "
-	"with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
-					shift,addr);
+		if ((end >> shift) >= NODEMAPSIZE)
+			return 0;
+		do {
+			if (memnodemap[addr >> shift] != 0xff)
 				return -1;
-			} 
 			memnodemap[addr >> shift] = i;
-		} 
+			addr += (1 << shift);
+		} while (addr < end);
+		res = 1;
 	} 
+	return res;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+	int shift = 20;
+
+	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+		shift++;
+
+	printk(KERN_DEBUG "Using %d for the hash shift.\n",
+		shift);
+
+	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+		printk(KERN_INFO
+	"Your memory is not aligned you need to rebuild your kernel "
+	"with a bigger NODEMAPSIZE shift=%d\n",
+			shift);
+		return -1;
+	}
 	return shift;
 }
 
-- 
cgit v1.2.2


From af9c142de94ecf724a18700273bbba390873e072 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Force correct address space size for MTRR on some
 64bit Intel Xeons

They report 40bit, but only have 36bits of physical address space.
This caused problems with setting up the correct masks for MTRR.

CPUID workaround for steppings 0F33h(supporting x86) and 0F34h(supporting x86
and EM64T). Detail info can be found at:
http://download.intel.com/design/Xeon/specupdt/30240216.pdf
http://download.intel.com/design/Pentium4/specupdt/30235221.pdf

Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 2ad3556dda52..476ee034fca2 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -995,6 +995,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		unsigned eax = cpuid_eax(0x80000008);
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
+		/* CPUID workaround for Intel 0F34 CPU */
+		if (c->x86_vendor == X86_VENDOR_INTEL &&
+		    c->x86 == 0xF && c->x86_model == 0x3 &&
+		    c->x86_mask == 0x4)
+			c->x86_phys_bits = 36;
 	}
 
 	if (c->x86 == 15)
-- 
cgit v1.2.2


From ea0be473a1f0ee89024a24d8ea4b05fbf6efcee3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Allow modular build of ia32 aout loader

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig              | 2 +-
 arch/x86_64/ia32/ia32_aout.c     | 3 ---
 arch/x86_64/ia32/ia32_binfmt.c   | 4 +++-
 arch/x86_64/kernel/x8664_ksyms.c | 3 +++
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 504dc52e8bfa..fd10bf82f8d4 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -510,7 +510,7 @@ config IA32_EMULATION
 	  left.
 
 config IA32_AOUT
-       bool "IA32 a.out support"
+       tristate "IA32 a.out support"
        depends on IA32_EMULATION
        help
          Support old a.out binaries in the 32bit emulation.
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 93c60f4aa47a..3bf58af98936 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -36,9 +36,6 @@
 #undef WARN_OLD
 #undef CORE_DUMP /* probably broken */
 
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
-				unsigned long stack_top, int exec_stack);
-
 static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout_library(struct file*);
 
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index d9161e395978..830feb272eca 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -335,7 +335,8 @@ static void elf32_init(struct pt_regs *regs)
 	me->thread.es = __USER_DS;
 }
 
-int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack)
+int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top,
+			 int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -389,6 +390,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int exec
 	
 	return 0;
 }
+EXPORT_SYMBOL(ia32_setup_arg_pages);
 
 static unsigned long
 elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index fd99ddd009bc..4a54221e10bc 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -203,3 +203,6 @@ EXPORT_SYMBOL(flush_tlb_page);
 #endif
 
 EXPORT_SYMBOL(cpu_khz);
+
+EXPORT_SYMBOL(load_gs_index);
+
-- 
cgit v1.2.2


From a88cde13bae3fffd6ecc812bdd02c91eafb6073e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Formatting fixes for arch/x86_64/kernel/process.c

No functional changes.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/process.c | 47 +++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 20 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index b5a89c0bdf59..bc44c92ca198 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -148,7 +148,8 @@ void cpu_idle_wait(void)
 	do {
 		ssleep(1);
 		for_each_online_cpu(cpu) {
-			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+			if (cpu_isset(cpu, map) &&
+					!per_cpu(cpu_idle_state, cpu))
 				cpu_clear(cpu, map);
 		}
 		cpus_and(map, map, cpu_online_map);
@@ -278,7 +279,8 @@ void __show_regs(struct pt_regs * regs)
 		system_utsname.version);
 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
 	printk_address(regs->rip); 
-	printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+	printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
+		regs->eflags);
 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
 	       regs->rax, regs->rbx, regs->rcx);
 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -430,15 +432,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
 	struct pt_regs * childregs;
 	struct task_struct *me = current;
 
-	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
-
+	childregs = ((struct pt_regs *)
+			(THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
 	*childregs = *regs;
 
 	childregs->rax = 0;
 	childregs->rsp = rsp;
-	if (rsp == ~0UL) {
+	if (rsp == ~0UL)
 		childregs->rsp = (unsigned long)childregs;
-	}
 
 	p->thread.rsp = (unsigned long) childregs;
 	p->thread.rsp0 = (unsigned long) (childregs+1);
@@ -460,7 +461,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
 			p->thread.io_bitmap_max = 0;
 			return -ENOMEM;
 		}
-		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+				IO_BITMAP_BYTES);
 	} 
 
 	/*
@@ -497,7 +499,8 @@ out:
  * - fold all the options into a flag word and test it with a single test.
  * - could test fs/gs bitsliced
  */
-struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
@@ -568,7 +571,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
 	prev->userrsp = read_pda(oldrsp); 
 	write_pda(oldrsp, next->userrsp); 
 	write_pda(pcurrent, next_p); 
-	write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+	write_pda(kernelstack,
+	    (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
 
 	/*
 	 * Now maybe reload the debug registers
@@ -649,7 +653,9 @@ asmlinkage long sys_fork(struct pt_regs *regs)
 	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
 }
 
-asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+asmlinkage long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
 {
 	if (!newsp)
 		newsp = regs->rsp;
@@ -685,7 +691,8 @@ unsigned long get_wchan(struct task_struct *p)
 		return 0;
 	fp = *(u64 *)(p->thread.rsp);
 	do { 
-		if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+		if (fp < (unsigned long)stack ||
+		    fp > (unsigned long)stack+THREAD_SIZE)
 			return 0; 
 		rip = *(u64 *)(fp+8); 
 		if (!in_sched_functions(rip))
@@ -720,8 +727,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			task->thread.gsindex = 0;
 			task->thread.gs = addr;
 			if (doit) {
-		load_gs_index(0);
-		ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 
+				load_gs_index(0);
+				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
 			} 
 		}
 		put_cpu();
@@ -738,7 +745,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			set_32bit_tls(task, FS_TLS, addr);
 			if (doit) { 
 				load_TLS(&task->thread, cpu); 
-				asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
 			}
 			task->thread.fsindex = FS_TLS_SEL;
 			task->thread.fs = 0;
@@ -748,8 +755,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			if (doit) {
 				/* set the selector to 0 to not confuse
 				   __switch_to */
-		asm volatile("movl %0,%%fs" :: "r" (0));
-		ret = checking_wrmsrl(MSR_FS_BASE, addr); 
+				asm volatile("movl %0,%%fs" :: "r" (0));
+				ret = checking_wrmsrl(MSR_FS_BASE, addr);
 			}
 		}
 		put_cpu();
@@ -758,9 +765,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 		unsigned long base; 
 		if (task->thread.fsindex == FS_TLS_SEL)
 			base = read_32bit_tls(task, FS_TLS);
-		else if (doit) {
+		else if (doit)
 			rdmsrl(MSR_FS_BASE, base);
-		} else
+		else
 			base = task->thread.fs;
 		ret = put_user(base, (unsigned long __user *)addr); 
 		break; 
@@ -769,9 +776,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 		unsigned long base;
 		if (task->thread.gsindex == GS_TLS_SEL)
 			base = read_32bit_tls(task, GS_TLS);
-		else if (doit) {
+		else if (doit)
 			rdmsrl(MSR_KERNEL_GS_BASE, base);
-		} else
+		else
 			base = task->thread.gs;
 		ret = put_user(base, (unsigned long __user *)addr); 
 		break;
-- 
cgit v1.2.2


From 3506229ff9968e5dbc862a50285fbea0e8821b58 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Don't enable interrupt unconditionally in reboot path

When they were disabled before (e.g. after a panic) it's better
to keep them off, otherwise followon panics can happen from timer
interrupt handlers etc.

Drawback is that pageup in the console won't work anymore though.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/reboot.c | 5 +++--
 arch/x86_64/kernel/smp.c    | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 47f95687905f..97804bc2c0a0 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -77,6 +77,7 @@ static inline void kb_wait(void)
 
 void machine_shutdown(void)
 {
+	unsigned long flags;
 	/* Stop the cpus and apics */
 #ifdef CONFIG_SMP
 	int reboot_cpu_id;
@@ -98,7 +99,7 @@ void machine_shutdown(void)
 	smp_send_stop();
 #endif
 
-	local_irq_disable();
+	local_irq_save(flags);
 
 #ifndef CONFIG_SMP
 	disable_local_APIC();
@@ -106,7 +107,7 @@ void machine_shutdown(void)
 
 	disable_IO_APIC();
 
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 void machine_emergency_restart(void)
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 493a0d1f1279..cfc3d9dccbd9 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -450,13 +450,14 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 
 void smp_stop_cpu(void)
 {
+	unsigned long flags;
 	/*
 	 * Remove this CPU:
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
-	local_irq_disable();
+	local_irq_save(flags);
 	disable_local_APIC();
-	local_irq_enable(); 
+	local_irq_restore(flags);
 }
 
 static void smp_really_stop_cpu(void *dummy)
-- 
cgit v1.2.2


From 94605eff572b727aaad9b4b29bc358b919096503 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86-64/i386: Intel HT, Multi core detection fixes

Fields obtained through cpuid vector 0x1(ebx[16:23]) and
vector 0x4(eax[14:25], eax[26:31]) indicate the maximum values and might not
always be the same as what is available and what OS sees.  So make sure
"siblings" and "cpu cores" values in /proc/cpuinfo reflect the values as seen
by OS instead of what cpuid instruction says. This will also fix the buggy BIOS
cases (for example where cpuid on a single core cpu says there are "2" siblings,
even when HT is disabled in the BIOS.
http://bugzilla.kernel.org/show_bug.cgi?id=4359)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c   | 69 +++++++++++++++++++-------------------------
 arch/x86_64/kernel/smpboot.c | 69 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 84 insertions(+), 54 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 476ee034fca2..40c77f6fe4b0 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -795,7 +795,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 #endif
 
 	bits = 0;
-	while ((1 << bits) < c->x86_num_cores)
+	while ((1 << bits) < c->x86_max_cores)
 		bits++;
 
 	/* Low order bits define the core id (index of core in socket) */
@@ -828,7 +828,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 	numa_set_node(cpu, node);
 
   	printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
-  			cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
+  			cpu, c->x86_max_cores, node, cpu_core_id[cpu]);
 #endif
 #endif
 }
@@ -877,9 +877,9 @@ static int __init init_amd(struct cpuinfo_x86 *c)
 	display_cacheinfo(c);
 
 	if (c->extended_cpuid_level >= 0x80000008) {
-		c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
-		if (c->x86_num_cores & (c->x86_num_cores - 1))
-			c->x86_num_cores = 1;
+		c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+		if (c->x86_max_cores & (c->x86_max_cores - 1))
+			c->x86_max_cores = 1;
 
 		amd_detect_cmp(c);
 	}
@@ -891,54 +891,44 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	u32 	eax, ebx, ecx, edx;
-	int 	index_msb, tmp;
+	int 	index_msb, core_bits;
 	int 	cpu = smp_processor_id();
-	
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+
+	c->apicid = phys_pkg_id(0);
+
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
 
-	cpuid(1, &eax, &ebx, &ecx, &edx);
 	smp_num_siblings = (ebx & 0xff0000) >> 16;
-	
+
 	if (smp_num_siblings == 1) {
 		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
-		index_msb = 31;
-		/*
-		 * At this point we only support two siblings per
-		 * processor package.
-		 */
+	} else if (smp_num_siblings > 1 ) {
+
 		if (smp_num_siblings > NR_CPUS) {
 			printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
 			smp_num_siblings = 1;
 			return;
 		}
-		tmp = smp_num_siblings;
-		while ((tmp & 0x80000000 ) == 0) {
-			tmp <<=1 ;
-			index_msb--;
-		}
-		if (smp_num_siblings & (smp_num_siblings - 1))
-			index_msb++;
+
+		index_msb = get_count_order(smp_num_siblings);
 		phys_proc_id[cpu] = phys_pkg_id(index_msb);
-		
+
 		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
 		       phys_proc_id[cpu]);
 
-		smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
-		tmp = smp_num_siblings;
-		index_msb = 31;
-		while ((tmp & 0x80000000) == 0) {
-			tmp <<=1 ;
-			index_msb--;
-		}
-		if (smp_num_siblings & (smp_num_siblings - 1))
-			index_msb++;
+		index_msb = get_count_order(smp_num_siblings) ;
+
+		core_bits = get_count_order(c->x86_max_cores);
 
-		cpu_core_id[cpu] = phys_pkg_id(index_msb);
+		cpu_core_id[cpu] = phys_pkg_id(index_msb) &
+					       ((1 << core_bits) - 1);
 
-		if (c->x86_num_cores > 1)
+		if (c->x86_max_cores > 1)
 			printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
 			       cpu_core_id[cpu]);
 	}
@@ -1006,7 +996,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		c->x86_cache_alignment = c->x86_clflush_size * 2;
 	if (c->x86 >= 15)
 		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
- 	c->x86_num_cores = intel_num_cpu_cores(c);
+ 	c->x86_max_cores = intel_num_cpu_cores(c);
 
 	srat_detect_node();
 }
@@ -1044,7 +1034,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_clflush_size = 64;
 	c->x86_cache_alignment = c->x86_clflush_size;
-	c->x86_num_cores = 1;
+	c->x86_max_cores = 1;
 	c->extended_cpuid_level = 0;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
@@ -1278,13 +1268,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
 	
 #ifdef CONFIG_SMP
-	if (smp_num_siblings * c->x86_num_cores > 1) {
+	if (smp_num_siblings * c->x86_max_cores > 1) {
 		int cpu = c - cpu_data;
 		seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
-		seq_printf(m, "siblings\t: %d\n",
-				c->x86_num_cores * smp_num_siblings);
+		seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
 		seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
-		seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores);
+		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 	}
 #endif	
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index f74319a80659..2b9ddba61b37 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -64,6 +64,7 @@
 int smp_num_siblings = 1;
 /* Package ID of each logical CPU */
 u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+/* core ID of each logical CPU */
 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(phys_proc_id);
 EXPORT_SYMBOL(cpu_core_id);
@@ -89,7 +90,10 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 /* Set when the idlers are all forked */
 int smp_threads_ready;
 
+/* representing HT siblings of each logical CPU */
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+
+/* representing HT and core siblings of each logical CPU */
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_core_map);
 
@@ -436,30 +440,59 @@ void __cpuinit smp_callin(void)
 	cpu_set(cpuid, cpu_callin_map);
 }
 
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
 static inline void set_cpu_sibling_map(int cpu)
 {
 	int i;
+	struct cpuinfo_x86 *c = cpu_data;
+
+	cpu_set(cpu, cpu_sibling_setup_map);
 
 	if (smp_num_siblings > 1) {
-		for_each_cpu(i) {
-			if (cpu_core_id[cpu] == cpu_core_id[i]) {
+		for_each_cpu_mask(i, cpu_sibling_setup_map) {
+			if (phys_proc_id[cpu] == phys_proc_id[i] &&
+			    cpu_core_id[cpu] == cpu_core_id[i]) {
 				cpu_set(i, cpu_sibling_map[cpu]);
 				cpu_set(cpu, cpu_sibling_map[i]);
+				cpu_set(i, cpu_core_map[cpu]);
+				cpu_set(cpu, cpu_core_map[i]);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
-	if (current_cpu_data.x86_num_cores > 1) {
-		for_each_cpu(i) {
-			if (phys_proc_id[cpu] == phys_proc_id[i]) {
-				cpu_set(i, cpu_core_map[cpu]);
-				cpu_set(cpu, cpu_core_map[i]);
-			}
-		}
-	} else {
+	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
+		c[cpu].booted_cores = 1;
+		return;
+	}
+
+	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (phys_proc_id[cpu] == phys_proc_id[i]) {
+			cpu_set(i, cpu_core_map[cpu]);
+			cpu_set(cpu, cpu_core_map[i]);
+			/*
+			 *  Does this new cpu bringup a new core?
+			 */
+			if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+				/*
+				 * for each core in package, increment
+				 * the booted_cores for this new cpu
+				 */
+				if (first_cpu(cpu_sibling_map[i]) == i)
+					c[cpu].booted_cores++;
+				/*
+				 * increment the core count for all
+				 * the other cpus in this package
+				 */
+				if (i != cpu)
+					c[i].booted_cores++;
+			} else if (i != cpu && !c[cpu].booted_cores)
+				c[cpu].booted_cores = c[i].booted_cores;
+		}
 	}
 }
 
@@ -993,6 +1026,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	nmi_watchdog_default();
 	current_cpu_data = boot_cpu_data;
 	current_thread_info()->cpu = 0;  /* needed? */
+	set_cpu_sibling_map(0);
 
 	if (smp_sanity_check(max_cpus) < 0) {
 		printk(KERN_INFO "SMP disabled\n");
@@ -1036,8 +1070,6 @@ void __init smp_prepare_boot_cpu(void)
 	int me = smp_processor_id();
 	cpu_set(me, cpu_online_map);
 	cpu_set(me, cpu_callout_map);
-	cpu_set(0, cpu_sibling_map[0]);
-	cpu_set(0, cpu_core_map[0]);
 	per_cpu(cpu_state, me) = CPU_ONLINE;
 }
 
@@ -1106,15 +1138,24 @@ void __init smp_cpus_done(unsigned int max_cpus)
 static void remove_siblinginfo(int cpu)
 {
 	int sibling;
+	struct cpuinfo_x86 *c = cpu_data;
 
+	for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+		cpu_clear(cpu, cpu_core_map[sibling]);
+		/*
+		 * last thread sibling in this cpu core going down
+		 */
+		if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+			c[sibling].booted_cores--;
+	}
+			
 	for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
 		cpu_clear(cpu, cpu_sibling_map[sibling]);
-	for_each_cpu_mask(sibling, cpu_core_map[cpu])
-		cpu_clear(cpu, cpu_core_map[sibling]);
 	cpus_clear(cpu_sibling_map[cpu]);
 	cpus_clear(cpu_core_map[cpu]);
 	phys_proc_id[cpu] = BAD_APICID;
 	cpu_core_id[cpu] = BAD_APICID;
+	cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
 void remove_cpu_from_maps(void)
-- 
cgit v1.2.2


From a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Reduce number of retries for reset through keyboard
 controller

Old code could retry for 10 seconds worst time. Only try it
for one second now.

Suggested by Yinghai Lu

Cc: Yinghai.Lu@amd.com

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/reboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 97804bc2c0a0..75235ed2b31b 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -121,7 +121,7 @@ void machine_emergency_restart(void)
 		/* Could also try the reset bit in the Hammer NB */
 		switch (reboot_type) { 
 		case BOOT_KBD:
-		for (i=0; i<100; i++) {
+		for (i=0; i<10; i++) {
 			kb_wait();
 			udelay(50);
 			outb(0xfe,0x64);         /* pulse reset low */
-- 
cgit v1.2.2


From a5b250a428aabc619ace872f8220a7d0b8f7d557 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Remove optimization for B stepping AMD K8

B stepping were the first shipping Opterons. memcpy/memset/copy_page/
clear_page had special optimized version for them. These are really
old and in the minority now and the difference to the generic versions
(using rep microcode) is not that big anyways. So just remove them.

TODO: figure out optimized versions for Intel Netburst based EM64T

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/lib/clear_page.S | 38 ------------------
 arch/x86_64/lib/copy_page.S  | 87 ----------------------------------------
 arch/x86_64/lib/memcpy.S     | 93 +------------------------------------------
 arch/x86_64/lib/memset.S     | 94 --------------------------------------------
 4 files changed, 2 insertions(+), 310 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 30a9da458c15..43d9fa136180 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -5,46 +5,8 @@
 	.globl clear_page
 	.p2align 4
 clear_page:
-	xorl   %eax,%eax
-	movl   $4096/64,%ecx
-	.p2align 4
-.Lloop:
-	decl	%ecx
-#define PUT(x) movq %rax,x*8(%rdi) 
-	movq %rax,(%rdi)
-	PUT(1)
-	PUT(2)
-	PUT(3)
-	PUT(4)
-	PUT(5)
-	PUT(6)
-	PUT(7)
-	leaq	64(%rdi),%rdi
-	jnz	.Lloop
-	nop
-	ret
-clear_page_end:	
-	
-	/* C stepping K8 run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-	
-#include <asm/cpufeature.h>
-	    	
-	.section .altinstructions,"a"
-	.align 8
-	.quad  clear_page
-	.quad  clear_page_c
-	.byte  X86_FEATURE_K8_C
-	.byte  clear_page_end-clear_page	
-	.byte  clear_page_c_end-clear_page_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-clear_page_c:
 	movl $4096/8,%ecx
 	xorl %eax,%eax
 	rep 
 	stosq
 	ret
-clear_page_c_end:
-	.previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index dd3aa47b6bf5..621a19769406 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,94 +8,7 @@
 	.globl copy_page
 	.p2align 4
 copy_page:
-	subq	$3*8,%rsp
-	movq	%rbx,(%rsp)
-	movq	%r12,1*8(%rsp)
-	movq	%r13,2*8(%rsp)
-			
-	movl	$(4096/64)-5,%ecx
-	.p2align 4
-.Loop64:	
-  	dec     %rcx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	prefetcht0 5*64(%rsi)
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-
-	leaq    64 (%rsi), %rsi
-	leaq    64 (%rdi), %rdi
-
-	jnz     .Loop64
-
-	movl	$5,%ecx
-	.p2align 4
-.Loop2:	
-	decl   %ecx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-	
-	leaq	64(%rdi),%rdi			
-	leaq	64(%rsi),%rsi			
-	
-	jnz	.Loop2		
-	
-	movq	(%rsp),%rbx
-	movq	1*8(%rsp),%r12
-	movq	2*8(%rsp),%r13
-	addq	$3*8,%rsp
-	ret
-	
-	/* C stepping K8 run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>		
-		
-	.section .altinstructions,"a"
-	.align 8
-	.quad  copy_page
-	.quad  copy_page_c
-	.byte  X86_FEATURE_K8_C
-	.byte  copy_page_c_end-copy_page_c
-	.byte  copy_page_c_end-copy_page_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-copy_page_c:
 	movl $4096/8,%ecx
 	rep 
 	movsq 
 	ret
-copy_page_c_end:
-	.previous
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c46494fef5..92dd80544602 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
  * 
  * Output:
  * rax original destination
+ * 
+ * TODO: check best memcpy for PSC
  */	
 
  	.globl __memcpy
@@ -18,95 +20,6 @@
 	.p2align 4
 __memcpy:
 memcpy:		
-	pushq %rbx
-	movq %rdi,%rax
-
-	movl %edx,%ecx
-	shrl $6,%ecx
-	jz .Lhandle_tail
-	
-	.p2align 4
-.Lloop_64:
-	decl %ecx
-	
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
-
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
-
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
-
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
-		
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
-
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	jnz  .Lloop_64
-
-.Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz   .Lhandle_7
-	.p2align 4
-.Lloop_8: 
-	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi) 
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
-	jnz  .Lloop_8
-
-.Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
-	.p2align 4
-.Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi) 
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
-	
-.Lende: 	
-	popq %rbx
-	ret
-.Lfinal:
-	
-	/* C stepping K8 run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-	
-	.section .altinstructions,"a"
-	.align 8
-	.quad  memcpy
-	.quad  memcpy_c
-	.byte  X86_FEATURE_K8_C
-	.byte  .Lfinal-memcpy
-	.byte  memcpy_c_end-memcpy_c	
-	.previous
-
-	.section .altinstr_replacement,"ax"
- /* rdi	destination
-  * rsi source
-  * rdx count
-  */			
-memcpy_c:
 	movq %rdi,%rax
 	movl %edx,%ecx
 	shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
 	rep
 	movsb
 	ret
-memcpy_c_end:
-	.previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c40638640..2aa48f24ed1e 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
 	.p2align 4
 memset:	
 __memset:
-	movq %rdi,%r10
-	movq %rdx,%r11
-
-	/* expand byte value  */
-	movzbl %sil,%ecx
-	movabs $0x0101010101010101,%rax
-	mul    %rcx		/* with rax, clobbers rdx */
-
-	/* align dst */
-	movl  %edi,%r9d		
-	andl  $7,%r9d	
-	jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-	
-	movl %r11d,%ecx
-	shrl $6,%ecx
-	jz	 .Lhandle_tail
-
-	.p2align 4
-.Lloop_64:	
-	decl   %ecx
-	movq  %rax,(%rdi) 
-	movq  %rax,8(%rdi) 
-	movq  %rax,16(%rdi) 
-	movq  %rax,24(%rdi) 
-	movq  %rax,32(%rdi) 
-	movq  %rax,40(%rdi) 
-	movq  %rax,48(%rdi) 
-	movq  %rax,56(%rdi) 
-	leaq  64(%rdi),%rdi
-	jnz    .Lloop_64
-
-	/* Handle tail in loops. The loops should be faster than hard
-	   to predict jump tables. */ 
-	.p2align 4	   
-.Lhandle_tail:
-	movl	%r11d,%ecx
-	andl    $63&(~7),%ecx
-	jz 		.Lhandle_7
-	shrl	$3,%ecx
-	.p2align 4
-.Lloop_8:
-	decl   %ecx
-	movq  %rax,(%rdi)
-	leaq  8(%rdi),%rdi
-	jnz    .Lloop_8
-
-.Lhandle_7:
-	movl	%r11d,%ecx
-	andl	$7,%ecx
-	jz      .Lende
-	.p2align 4
-.Lloop_1:
-	decl    %ecx
-	movb 	%al,(%rdi)
-	leaq	1(%rdi),%rdi
-	jnz     .Lloop_1
-	
-.Lende:	
-	movq	%r10,%rax
-	ret
-
-.Lbad_alignment:
-	cmpq $7,%r11
-	jbe	.Lhandle_7
-	movq %rax,(%rdi)	/* unaligned store */
-	movq $8,%r8			
-	subq %r9,%r8 
-	addq %r8,%rdi
-	subq %r8,%r11
-	jmp .Lafter_bad_alignment
-
-	/* C stepping K8 run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>	
-		
-	.section .altinstructions,"a"
-	.align 8
-	.quad  memset
-	.quad  memset_c
-	.byte  X86_FEATURE_K8_C
-	.byte  memset_c_end-memset_c
-	.byte  memset_c_end-memset_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
- /* rdi	destination
-  * rsi value
-  * rdx count
-  */			
-memset_c:	
 	movq %rdi,%r9
 	movl %edx,%r8d
 	andl $7,%r8d		
@@ -121,5 +29,3 @@ memset_c:
 	stosb
 	movq %r9,%rax
 	ret
-memset_c_end:
-	.previous
-- 
cgit v1.2.2


From efbbdce94f6ea54cf06d9a06e4c95f6874ad64a8 Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Use common sys_time64

Keeping this function does not makes sense because it's a copied (and
buggy) copy of sys_time.  The only difference is that now.tv_sec (which is
a time_t, i.e.  a 64-bit long) is copied (and truncated) into a int
(32-bit).

The prototype is the same (they both take a long __user *), so let's drop
this and redirect it to sys_time (and make sure it exists by defining
__ARCH_WANT_SYS_TIME).

Only disadvantage is that the sys_stime definition is also compiled (may be
fixed if needed by adding a separate __ARCH_WANT_SYS_STIME macro, and
defining it for all arch's defining __ARCH_WANT_SYS_TIME except x86_64).

Acked-by: Andi Kleen <ak@suse.de>
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/sys_x86_64.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
index cc7821c68851..6449ea8fe756 100644
--- a/arch/x86_64/kernel/sys_x86_64.c
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -154,17 +154,3 @@ asmlinkage long sys_uname(struct new_utsname __user * name)
 		err |= copy_to_user(&name->machine, "i686", 5); 		
 	return err ? -EFAULT : 0;
 }
-
-asmlinkage long sys_time64(long __user * tloc)
-{
-	struct timeval now; 
-	int i; 
-
-	do_gettimeofday(&now);
-	i = now.tv_sec;
-	if (tloc) {
-		if (put_user(i,tloc))
-			i = -EFAULT;
-	}
-	return i;
-}
-- 
cgit v1.2.2


From b0bd35e622ffbda2c01dc67a0381c6a18817a29a Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Make ACPI NUMA and NUMA emulation peers of K8_NUMA in
 Kconfig

On x86_64 arches, there is no way to choose ACPI_NUMA without having to choose
K8_NUMA.  CONFIG_K8_NUMA is not needed for Intel EM64T NUMA boxes.  It also
looks odd if you have to select ACPI_NUMA from the power management menu.
This patch fixes those oddities.  Patch does the following:

1. Makes NUMA a config option like other arches
2. Makes topology detection options like K8_NUMA dependent on NUMA
3. Choosing ACPI NUMA detection can be done from the standard
   "Processor type and features" menu

AK: I fixed up the dependencies and changed the help texts a bit
on top of Kiran's patch.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index fd10bf82f8d4..1d6242a5cd0a 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -226,22 +226,42 @@ config SCHED_SMT
 
 source "kernel/Kconfig.preempt"
 
-config K8_NUMA
-       bool "K8 NUMA support"
-       select NUMA
+config NUMA
+       bool "Non Uniform Memory Access (NUMA) Support"
        depends on SMP
        help
-	  Enable NUMA (Non Unified Memory Architecture) support for
-	  AMD Opteron Multiprocessor systems. The kernel will try to allocate
-	  memory used by a CPU on the local memory controller of the CPU
-	  and add some more NUMA awareness to the kernel.
-	  This code is recommended on all multiprocessor Opteron systems
-	  and normally doesn't hurt on others.
+	 Enable NUMA (Non Uniform Memory Access) support. The kernel 
+	 will try to allocate memory used by a CPU on the local memory 
+	 controller of the CPU and add some more NUMA awareness to the kernel.
+	 This code is recommended on all multiprocessor Opteron systems.
+	 If the system is EM64T, you should say N unless your system is EM64T 
+	 NUMA. 
+
+config K8_NUMA
+       bool "Old style AMD Opteron NUMA detection"
+       depends on NUMA
+       default y
+       help
+	 Enable K8 NUMA node topology detection.  You should say Y here if
+	 you have a multi processor AMD K8 system. This uses an old
+	 method to read the NUMA configurtion directly from the builtin
+	 Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
+	 instead, which also takes priority if both are compiled in.   
+
+# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.
+
+config X86_64_ACPI_NUMA
+       bool "ACPI NUMA detection"
+       depends on NUMA
+       select ACPI 
+       select ACPI_NUMA
+       default y
+       help
+	 Enable ACPI SRAT based node topology detection.
 
 config NUMA_EMU
-	bool "NUMA emulation support"
-	select NUMA
-	depends on SMP
+	bool "NUMA emulation"
+	depends on NUMA
 	help
 	  Enable NUMA emulation. A flat machine will be split
 	  into virtual nodes when booted with "numa=fake=N", where N is the
@@ -252,9 +272,6 @@ config ARCH_DISCONTIGMEM_ENABLE
        depends on NUMA
        default y
 
-config NUMA
-       bool
-       default n
 
 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
-- 
cgit v1.2.2


From e583538f077d5f70191670b47a046ba436ec3428 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Log machine checks from boot on Intel systems

The logging for boot errors was turned off because it was broken
on some AMD systems. But give Intel EM64T systems a chance because they are
supposed to be correct there.

The advantage is that there is a chance to actually log uncorrected
machine checks after the reset.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/mce.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index cf8a76f0f47e..183dc6105429 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -37,7 +37,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
 static unsigned long console_logged;
 static int notify_user;
 static int rip_msr;
-static int mce_bootlog;
+static int mce_bootlog = 1;
 
 /*
  * Lockless MCE logging infrastructure.
@@ -347,7 +347,11 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 		/* disable GART TBL walk error reporting, which trips off 
 		   incorrectly with the IOMMU & 3ware & Cerberus. */
 		clear_bit(10, &bank[4]);
+		/* Lots of broken BIOS around that don't clear them
+		   by default and leave crap in there. Don't log. */
+		mce_bootlog = 0;
 	}
+
 }			
 
 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
@@ -498,16 +502,16 @@ static int __init mcheck_disable(char *str)
 /* mce=off disables machine check. Note you can reenable it later
    using sysfs.
    mce=TOLERANCELEVEL (number, see above)
-   mce=bootlog Log MCEs from before booting. Disabled by default to work
-   around buggy BIOS that leave bogus MCEs.  */
+   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+   mce=nobootlog Don't log MCEs from before booting. */
 static int __init mcheck_enable(char *str)
 {
 	if (*str == '=')
 		str++;
 	if (!strcmp(str, "off"))
 		mce_dont_init = 1;
-	else if (!strcmp(str, "bootlog"))
-		mce_bootlog = 1;
+	else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
+		mce_bootlog = str[0] == 'b';
 	else if (isdigit(str[0]))
 		get_option(&str, &tolerant);
 	else
-- 
cgit v1.2.2


From ffd10a2b77bca50dd05ba26acd5a6e68bcc8f61f Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus@valinux.co.jp>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Make node boundaries consistent

The current x86_64 NUMA memory code is inconsequent when it comes to node
memory ranges. The exact behaviour varies depending on which config option
that is used.

setup_node_bootmem() has start and end as arguments and these are used to
calculate the size of the node like this: (end - start). This is all fine
if end is pointing to the first non-available byte. The problem is that the
current x86_64 code sometimes treats it as the last present byte and sometimes
as the first non-available byte. The result is that some configurations might
lose a page at the end of the range.

This patch tries to fix CONFIG_ACPI_NUMA, CONFIG_K8_NUMA and CONFIG_NUMA_EMU
so they all treat the end variable as the first non-available byte. This is
the same way as the single node code.

The patch is boot tested on dual x86_64 hardware with the above configurations,
but maybe the removed code is needed as some workaround?

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/k8topology.c | 1 +
 arch/x86_64/mm/numa.c       | 2 --
 arch/x86_64/mm/srat.c       | 4 ----
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 65417b040c1b..a5663e0bb01c 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -108,6 +108,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		limit >>= 16; 
 		limit <<= 24; 
 		limit |= (1<<24)-1;
+		limit++;
 
 		if (limit > end_pfn << PAGE_SHIFT)
 			limit = end_pfn << PAGE_SHIFT;
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index edd5559380d3..629ff0621b3d 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -209,8 +209,6 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
  		if (i == numa_fake-1)
  			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
  		nodes[i].end = nodes[i].start + sz;
- 		if (i != numa_fake-1)
- 			nodes[i].end--;
  		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
  		       i,
  		       nodes[i].start, nodes[i].end,
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index c7aa08a58041..33340bd1e328 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -71,8 +71,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 			nd->start = nd->end;
 	}
 	if (nd->end > end) {
-		if (!(end & 0xfff))
-			end--;
 		nd->end = end;
 		if (nd->start > nd->end)
 			nd->start = nd->end;
@@ -166,8 +164,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		if (nd->end < end)
 			nd->end = end;
 	}
-	if (!(nd->end & 0xfff))
-		nd->end--;
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
 }
-- 
cgit v1.2.2


From 9e43e1b7c7c9872da032442d8e4bb112a02d16f4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Remove CONFIG_CHECKING and add command line option
 for pagefault tracing

CONFIG_CHECKING covered some debugging code used in the early times
of the port. But it wasn't even SMP safe for quite some time
and the bugs it checked for seem to be gone.

This patch removes all the code to verify GS at kernel entry. There
haven't been any new bugs in this area for a long time.

Previously it also covered the sysctl for the page fault tracing.
That didn't make much sense because that code was unconditionally
compiled in. I made that a boot option now because it is typically
only useful at boot.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig.debug  |  9 ---------
 arch/x86_64/kernel/traps.c | 40 ----------------------------------------
 arch/x86_64/mm/fault.c     | 19 +++++++------------
 arch/x86_64/mm/init.c      |  4 ----
 4 files changed, 7 insertions(+), 65 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index 9cf1410d2f5a..3ccf6f4d1068 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -2,15 +2,6 @@ menu "Kernel hacking"
 
 source "lib/Kconfig.debug"
 
-# !SMP for now because the context switch early causes GPF in segment reloading
-# and the GS base checking does the wrong thing then, causing a hang.
-config CHECKING
-	bool "Additional run-time checks"
-	depends on DEBUG_KERNEL && !SMP
-	help
-	  Enables some internal consistency checks for kernel debugging.
-	  You should normally say N.
-
 config INIT_DEBUG
 	bool "Debug __init statements"
 	depends on DEBUG_KERNEL
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 4a836384dd0f..bf337f493189 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -428,19 +428,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
 {
 	conditional_sti(regs);
 
-#ifdef CONFIG_CHECKING
-       { 
-               unsigned long gs; 
-               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
-               rdmsrl(MSR_GS_BASE, gs); 
-               if (gs != (unsigned long)pda) { 
-                       wrmsrl(MSR_GS_BASE, pda); 
-                       printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
-			      regs->rip);
-               }
-       }
-#endif
-
 	if (user_mode(regs)) {
 		struct task_struct *tsk = current;
 
@@ -513,20 +500,6 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
 {
 	conditional_sti(regs);
 
-#ifdef CONFIG_CHECKING
-       { 
-               unsigned long gs; 
-               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
-               rdmsrl(MSR_GS_BASE, gs); 
-               if (gs != (unsigned long)pda) { 
-                       wrmsrl(MSR_GS_BASE, pda); 
-		       oops_in_progress++;
-                       printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
-		       oops_in_progress--;
-               }
-       }
-#endif
-
 	if (user_mode(regs)) {
 		struct task_struct *tsk = current;
 
@@ -665,19 +638,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
 	struct task_struct *tsk = current;
 	siginfo_t info;
 
-#ifdef CONFIG_CHECKING
-       { 
-	       /* RED-PEN interaction with debugger - could destroy gs */
-               unsigned long gs; 
-               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
-               rdmsrl(MSR_GS_BASE, gs); 
-               if (gs != (unsigned long)pda) { 
-                       wrmsrl(MSR_GS_BASE, pda); 
-                       printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
-               }
-       }
-#endif
-
 	get_debugreg(condition, 6);
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index b75b872ec154..3a63707a698b 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -308,18 +308,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	unsigned long flags;
 	siginfo_t info;
 
-#ifdef CONFIG_CHECKING
-	{ 
-		unsigned long gs; 
-		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); 
-		rdmsrl(MSR_GS_BASE, gs); 
-		if (gs != (unsigned long)pda) { 
-			wrmsrl(MSR_GS_BASE, pda); 
-			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
-		}
-	}
-#endif
-
 	/* get the address */
 	__asm__("movq %%cr2,%0":"=r" (address));
 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
@@ -571,3 +559,10 @@ do_sigbus:
 	force_sig_info(SIGBUS, &info, tsk);
 	return;
 }
+
+static int __init enable_pagefaulttrace(char *str)
+{
+	page_fault_trace = 1;
+	return 0;
+}
+__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 854a41b8372b..286f6a624c3a 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -565,10 +565,6 @@ extern int exception_trace, page_fault_trace;
 static ctl_table debug_table2[] = {
 	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
 	  proc_dointvec },
-#ifdef CONFIG_CHECKING
-	{ 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
-	  proc_dointvec },
-#endif
 	{ 0, }
 }; 
 
-- 
cgit v1.2.2


From 8893166ff8694f36655009aa9bf8e7f2e1c9339f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Increase the maximum number of local APICs to the
 maximum

This is needed for large multinode IBM systems which have a sparse
APIC space in clustered mode, fully covering the available 8 bits.

The previous kernels would limit the local APIC number to 127,
which caused it to reject some of the CPUs at boot.

I increased the maximum and shrunk the apic_version array a bit
to make up for that (the version is only 8 bit, so don't need
an full int to store)

Cc:  Chris McDermott <lcm@us.ibm.com>

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/mpparse.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 1d61f10a92c6..1105250bf02c 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -42,7 +42,7 @@ int acpi_found_madt;
  * Various Linux-internal data structures created from the
  * MP-table.
  */
-int apic_version [MAX_APICS];
+unsigned char apic_version [MAX_APICS];
 unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
 int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
 
@@ -108,7 +108,8 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 
 static void __init MP_processor_info (struct mpc_config_processor *m)
 {
-	int ver, cpu;
+	int cpu;
+	unsigned char ver;
 	static int found_bsp=0;
 
 	if (!(m->mpc_cpuflag & CPU_ENABLED)) {
@@ -133,12 +134,14 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 	}
 
 	cpu = num_processors++;
-
-	if (m->mpc_apicid > MAX_APICS) {
+	
+#if MAX_APICS < 255	
+	if ((int)m->mpc_apicid > MAX_APICS) {
 		printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
 			m->mpc_apicid, MAX_APICS);
 		return;
 	}
+#endif
 	ver = m->mpc_apicver;
 
 	physid_set(m->mpc_apicid, phys_cpu_present_map);
-- 
cgit v1.2.2


From d3ee871e63d0a0c70413dc0aa5534b8d6cd6ec37 Mon Sep 17 00:00:00 2001
From: Bob Picco <bob.picco@hp.com>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Fix sparse mem

Fix up booting with sparse mem enabled. Otherwise it would just
cause an early PANIC at boot.

Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/setup.c |  3 ---
 arch/x86_64/mm/numa.c      | 18 +++++++++++++++++-
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 40c77f6fe4b0..750e01dcbdf4 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -412,7 +412,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long bootmap_size, bootmap;
 
-	memory_present(0, start_pfn, end_pfn);
 	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
 	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
 	if (bootmap == -1L)
@@ -659,8 +658,6 @@ void __init setup_arch(char **cmdline_p)
 	}
 #endif
 
-	sparse_init();
-
 	paging_init();
 
 	check_ioapic();
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 629ff0621b3d..a828a01739cc 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -113,7 +113,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 	start_pfn = start >> PAGE_SHIFT;
 	end_pfn = end >> PAGE_SHIFT;
 
-	memory_present(nodeid, start_pfn, end_pfn);
 	nodedata_phys = find_e820_area(start, end, pgdat_size); 
 	if (nodedata_phys == -1L) 
 		panic("Cannot find memory pgdat in node %d\n", nodeid);
@@ -285,9 +284,26 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 } 
 
+#ifdef CONFIG_SPARSEMEM
+static void __init arch_sparse_init(void)
+{
+	int i;
+
+	for_each_online_node(i)
+		memory_present(i, node_start_pfn(i), node_end_pfn(i));
+
+	sparse_init();
+}
+#else
+#define arch_sparse_init() do {} while (0)
+#endif
+
 void __init paging_init(void)
 { 
 	int i;
+
+	arch_sparse_init();
+
 	for_each_online_node(i) {
 		setup_node_zones(i); 
 	}
-- 
cgit v1.2.2


From 8bf1101bd52573e0573e374d56d2feecdbb5e444 Mon Sep 17 00:00:00 2001
From: Jim Keniston <jkenisto@us.ibm.com>
Date: Wed, 23 Nov 2005 13:37:42 -0800
Subject: [PATCH] kprobes: Fix return probes on sys_execve

Fix a bug in kprobes that can cause an Oops or even a crash when a return
probe is installed on one of the following functions: sys_execve,
do_execve, load_*_binary, flush_old_exec, or flush_thread.  The fix is to
remove the call to kprobe_flush_task() in flush_thread().  This fix has
been tested on all architectures for which the return-probes feature has
been implemented (i386, x86_64, ppc64, ia64).  Please apply.

BACKGROUND

Up to now, we have called kprobe_flush_task() under two situations: when a
task exits, and when it execs.  Flushing kretprobe_instances on exit is
correct because (a) do_exit() doesn't return, and (b) one or more
return-probed functions may be active when a task calls do_exit().  Neither
is the case for sys_execve() and its callees.

Initially, the mistaken call to kprobe_flush_task() on exec was harmless
because we put the "real" return address of each active probed function
back in the stack, just to be safe, when we recycled its
kretprobe_instance.  When support for ppc64 and ia64 was added, this safety
measure couldn't be employed, and was eventually dropped even for i386 and
x86_64.  sys_execve() and its callees were informally blacklisted for
return probes until this fix was developed.

Acked-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/process.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 5afd63e8cef7..7519fc520eb3 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -351,13 +351,6 @@ void flush_thread(void)
 	struct task_struct *tsk = current;
 	struct thread_info *t = current_thread_info();
 
-	/*
-	 * Remove function-return probe instances associated with this task
-	 * and put them back on the free list. Do not insert an exit probe for
-	 * this function, it will be disabled by kprobe_flush_task if you do.
-	 */
-	kprobe_flush_task(tsk);
-
 	if (t->flags & _TIF_ABI_PENDING)
 		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
 
-- 
cgit v1.2.2